2022-08-16 18:13:00 +02:00
|
|
|
#!/usr/bin/python3
|
|
|
|
"""
|
|
|
|
Module implémentant la recherche de partitions par mots-clés
|
|
|
|
"""
|
2022-08-17 09:44:31 +02:00
|
|
|
from uuid import uuid4
|
|
|
|
import urllib.request
|
2023-08-28 14:14:57 +02:00
|
|
|
import threading
|
2023-10-18 14:45:40 +02:00
|
|
|
import socket
|
2022-08-17 09:44:31 +02:00
|
|
|
import os
|
2022-08-16 18:13:00 +02:00
|
|
|
|
2024-02-21 15:53:49 +01:00
|
|
|
import pypdf
|
2022-08-17 09:44:31 +02:00
|
|
|
import googlesearch
|
2024-06-17 21:06:28 +02:00
|
|
|
from unidecode import unidecode
|
2022-08-17 09:44:31 +02:00
|
|
|
|
|
|
|
from .db import get_db
|
|
|
|
|
2023-10-18 14:45:40 +02:00
|
|
|
# Default timeout, in seconds, applied to every socket created after this call.
# It bounds how long we wait on a dead URL before giving up on a download.
socket.setdefaulttimeout(5)
|
|
|
|
|
2022-08-17 09:44:31 +02:00
|
|
|
|
|
|
|
def local_search(query, partitions):
    """
    Return at most the 5 most relevant entries of ``partitions`` for ``query``.

    Both the query and the partition fields are passed through ``unidecode``
    and lower-cased before being compared, so matching ignores case and
    accents.

    Args:
        query: free-text search string; it is split on whitespace into words.
        partitions: iterable of mappings exposing the keys ``"name"``,
            ``"author"`` and ``"body"``.

    Returns:
        A list of the best-scoring partitions (highest score first), limited
        to five entries and to strictly positive scores.
    """
    query_words = [token.lower() for token in unidecode(query).split()]

    # Fields probed for each query word, best match first, with their bonus.
    weighted_fields = (("name", 6), ("author", 4), ("body", 2))

    def relevance(partition):
        total = 0
        for needle in query_words:
            if needle == "":
                continue
            for field, bonus in weighted_fields:
                # Fields are read lazily: a later field is only looked at
                # when the earlier ones did not contain the word.
                if needle in unidecode(partition[field]).lower():
                    total += bonus
                    break
            else:
                # The word appears nowhere in this partition.
                total -= 6

        # Small penalty for every title word the query did not ask for, so
        # that shorter, more exact titles rank first.
        total -= sum(
            1
            for title_word in unidecode(partition["name"]).split()
            if title_word != "" and title_word.lower() not in query_words
        )
        return total

    ranked = sorted(
        ((relevance(entry), entry) for entry in partitions),
        key=lambda pair: pair[0],
        reverse=True,
    )

    # ``ranked`` is in descending score order, so keeping the positive scores
    # among the first five is the same as stopping at the first score <= 0.
    return [entry for points, entry in ranked[:5] if points > 0]
|
2022-08-17 09:44:31 +02:00
|
|
|
|
|
|
|
|
2024-01-16 18:50:19 +01:00
|
|
|
def download_search_result(element, instance_path):
    """
    Download one search hit and check that it is a readable PDF.

    The file is written to ``{instance_path}/search-partitions/{uuid}.pdf``.
    When the download fails, or the downloaded file cannot be opened by
    ``pypdf``, the target is left as an empty (zero-byte) file: that empty
    file is the failure marker ``online_search`` looks for.

    Args:
        element: mapping with at least the keys ``"uuid"`` and ``"url"``.
        instance_path: base directory containing ``search-partitions/``.
    """
    # Function-scope import: http.client is not part of the module's imports.
    from http.client import HTTPException

    uuid = element["uuid"]
    url = element["url"]
    filename = f"{instance_path}/search-partitions/{uuid}.pdf"

    try:
        urllib.request.urlretrieve(url, filename)
        pypdf.PdfReader(filename)
    except (OSError, ValueError, HTTPException):
        # OSError covers URLError/HTTPError as well as read timeouts and
        # connection resets raised while the body is being transferred;
        # ValueError is raised for URLs without a usable scheme;
        # HTTPException covers protocol-level failures (truncated response...).
        pass
    except (pypdf.errors.PdfReadError, pypdf.errors.PdfStreamError):
        # Downloaded fine, but the content is not a PDF pypdf can open.
        pass
    else:
        return

    # Failure: replace whatever was (partially) written by an empty file.
    with open(filename, "wb"):
        pass
|
|
|
|
|
|
|
|
|
2024-01-16 18:50:19 +01:00
|
|
|
def online_search(query, num_queries, instance_path):
    """
    Search Google for PDF scores matching ``query`` and download the hits.

    Every URL returned by the search is recorded in the ``search_results``
    table under a fresh uuid, then downloaded (one thread per URL) to
    ``{instance_path}/search-partitions/{uuid}.pdf``. Hits whose download
    failed (empty or missing file) are removed from the table and from the
    result.

    Args:
        query: user search string; ``"partition filetype:pdf "`` is prepended.
        num_queries: number of results requested from the search engine.
        instance_path: base directory containing ``search-partitions/``.

    Returns:
        A list of dicts with the keys ``"name"`` (host part of the URL),
        ``"uuid"`` and ``"url"``, one per successfully downloaded file.
        An empty list when the search raises ``urllib.error.URLError``.
    """
    db = get_db()
    query = f"partition filetype:pdf {query}"
    partitions = []

    try:
        results = googlesearch.search(
            query,
            num=num_queries,
            stop=num_queries,
            pause=0.2
        )
        for element in results:
            # Host part of the URL, used as display name. A URL without
            # "://" falls back to its first path segment instead of raising.
            pieces = element.split("://")
            name = pieces[1 if len(pieces) > 1 else 0].split("/")[0]

            while True:
                uuid = str(uuid4())
                try:
                    db.execute(
                        """
                        INSERT INTO search_results (uuid, url)
                        VALUES (?, ?)
                        """,
                        (uuid, element,)
                    )
                    db.commit()
                except db.IntegrityError:
                    # Presumably a uuid collision: retry with a new uuid.
                    continue

                partitions.append(
                    {
                        "name": name,
                        "uuid": uuid,
                        "url": element
                    }
                )
                break

    except urllib.error.URLError:  # Unable to access network
        # Rows inserted so far stay in the table until flush_cache expires them.
        return []

    threads = [
        threading.Thread(
            target=download_search_result,
            args=(elem, instance_path)
        ) for elem in partitions
    ]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    downloaded = []
    for element in partitions:
        uuid = element["uuid"]
        url = element["url"]
        path = f"{instance_path}/search-partitions/{uuid}.pdf"

        # An empty file is the failure marker left by the download step; a
        # missing file (worker died before writing anything) counts as well.
        try:
            failed = os.stat(path).st_size == 0
        except FileNotFoundError:
            failed = True

        if not failed:
            downloaded.append(element)
            continue

        print("An error occured", url)
        db.execute(
            """
            DELETE FROM search_results
            WHERE uuid = ?
            """,
            (uuid,)
        )
        db.commit()

        if os.path.exists(path):
            os.remove(path)

    return downloaded
|
|
|
|
|
|
|
|
|
2024-01-16 18:50:19 +01:00
|
|
|
def flush_cache(instance_path):
    """
    Delete search results older than 15 minutes.

    For every expired row of ``search_results``, the downloaded PDF
    (``search-partitions/{uuid}.pdf``) and its thumbnail
    (``cache/search-thumbnails/{uuid}.jpg``) are removed when present, then
    the row itself is deleted.

    Args:
        instance_path: base directory containing ``search-partitions/`` and
            ``cache/search-thumbnails/``.
    """
    db = get_db()
    # NOTE(review): the comparison assumes creation_time is stored in local
    # time -- confirm against the table definition.
    expired_cache = db.execute(
        """
        SELECT uuid FROM search_results
        WHERE creation_time <= datetime('now', '-15 minutes', 'localtime')
        """
    ).fetchall()

    for element in expired_cache:
        uuid = element["uuid"]
        stale_files = (
            f"{instance_path}/search-partitions/{uuid}.pdf",
            f"{instance_path}/cache/search-thumbnails/{uuid}.jpg",
        )
        for path in stale_files:
            try:
                os.remove(path)
            except FileNotFoundError:
                # Never created, or already removed by someone else.
                pass

        # Delete exactly the rows whose files were just cleaned up. A second
        # time-based DELETE could also hit rows that expired after the SELECT
        # above, leaving their files on disk with nothing referencing them.
        db.execute(
            """
            DELETE FROM search_results
            WHERE uuid = ?
            """,
            (uuid,)
        )

    db.commit()
|