partitioncloud-server/partitioncloud/modules/search.py

173 lines
4.9 KiB
Python
Raw Normal View History

2022-08-16 18:13:00 +02:00
#!/usr/bin/python3
"""
Module implémentant la recherche de partitions par mots-clés
"""
2022-08-17 09:44:31 +02:00
from uuid import uuid4
import urllib.request
import threading
import socket
2022-08-17 09:44:31 +02:00
import os
2022-08-16 18:13:00 +02:00
2024-02-21 15:53:49 +01:00
import pypdf
2022-08-17 09:44:31 +02:00
import googlesearch
from unidecode import unidecode
2022-08-17 09:44:31 +02:00
from .db import get_db
socket.setdefaulttimeout(5) # Maximum time before we give up on downloading a file (dead url)
2022-08-17 09:44:31 +02:00
def local_search(query, partitions):
    """
    Return up to 5 of the given partitions, best keyword match first.

    Matching ignores accents (via unidecode) and case. For every query word,
    the first field that contains it decides the points: name (+6), then
    author (+4), then body (+2); a word found in none of them costs 6 points.
    Each word of the partition name that is not a query word costs 1 point.
    Only partitions with a strictly positive score are returned.
    """
    wanted = [token.lower() for token in unidecode(query).split()]

    def relevance(candidate):
        # The name is needed by both passes below, so normalise it once.
        title = unidecode(candidate["name"])
        title_lower = title.lower()

        total = 0
        for token in wanted:
            if token == "":
                continue
            # Author and body are only normalised when the earlier fields miss.
            if token in title_lower:
                total += 6
            elif token in unidecode(candidate["author"]).lower():
                total += 4
            elif token in unidecode(candidate["body"]).lower():
                total += 2
            else:
                total -= 6

        # Penalise names carrying words the user did not ask for.
        for title_word in title.split():
            if title_word != "" and title_word.lower() not in wanted:
                total -= 1
        return total

    ranked = sorted(
        ((relevance(candidate), candidate) for candidate in partitions),
        key=lambda pair: pair[0],
        reverse=True,
    )

    best = []
    for points, candidate in ranked[:5]:
        # Sorted by descending score: the first non-positive one ends the list.
        if points <= 0:
            break
        best.append(candidate)
    return best
2022-08-17 09:44:31 +02:00
def download_search_result(element, instance_path):
    """
    Download one search result and check that it is a readable PDF.

    `element` is a dict providing the "uuid" and "url" keys; the file is
    written to `{instance_path}/search-partitions/{uuid}.pdf`.

    On any download or PDF-parsing failure the target is replaced by an
    empty file, which acts as a "download failed" marker. Nothing is returned
    and no exception is propagated for these failures, so the function can be
    used as a thread target.
    """
    # Local import: only needed for the exception type below.
    import http.client

    uuid = element["uuid"]
    url = element["url"]

    filename = f"{instance_path}/search-partitions/{uuid}.pdf"

    try:
        urllib.request.urlretrieve(url, filename)
        pypdf.PdfReader(filename)
    except (
        # OSError covers urllib.error.URLError / HTTPError as well as socket
        # timeouts and connection resets raised while reading the body, which
        # urllib does not wrap in URLError.
        OSError,
        # Truncated or malformed HTTP responses (e.g. IncompleteRead).
        http.client.HTTPException,
        # Malformed URL ("unknown url type").
        ValueError,
        pypdf.errors.PdfReadError,
        pypdf.errors.PdfStreamError,
    ):
        # Truncate (or create) the file so that a partial download never
        # looks like a valid result.
        with open(filename, 'w', encoding="utf8") as _:
            pass  # Leave an empty file as the failure marker
def online_search(query, num_queries, instance_path):
    """
    Search Google for PDF scores matching `query` and download the hits.

    Up to `num_queries` results are requested. Each hit is recorded in the
    `search_results` table under a fresh uuid, then all hits are downloaded
    in parallel (one thread per hit) into
    `{instance_path}/search-partitions/{uuid}.pdf`.

    Hits whose download failed (file missing or empty) are removed from the
    database and from disk.

    Returns a list of dicts with the keys "name" (host part of the url),
    "uuid" and "url", or an empty list when the search itself cannot reach
    the network.
    """
    db = get_db()
    query = f"partition filetype:pdf {query}"
    partitions = []

    try:
        results = googlesearch.search(
            query,
            num=num_queries,
            stop=num_queries,
            pause=0.2
        )
        for element in results:
            # Computed before the insert so that an unexpected url shape
            # cannot leave a committed row without a matching result entry.
            name = element.split("://", 1)[-1].split("/")[0]
            while True:
                try:
                    uuid = str(uuid4())
                    db.execute(
                        """
                        INSERT INTO search_results (uuid, url)
                        VALUES (?, ?)
                        """,
                        (uuid, element,)
                    )
                    db.commit()
                    partitions.append(
                        {
                            "name": name,
                            "uuid": uuid,
                            "url": element
                        }
                    )
                    break
                except db.IntegrityError:
                    # Insert rejected: retry with a newly generated uuid.
                    pass
    except urllib.error.URLError:  # Unable to access network
        return []

    threads = [
        threading.Thread(
            target=download_search_result,
            args=(elem, instance_path)
        ) for elem in partitions
    ]

    for thread in threads:
        thread.start()

    for thread in threads:
        thread.join()

    downloaded = []
    for element in partitions:
        uuid = element["uuid"]
        url = element["url"]
        filename = f"{instance_path}/search-partitions/{uuid}.pdf"

        try:
            failed = os.stat(filename).st_size == 0
        except FileNotFoundError:
            # The download thread died before creating the file.
            failed = True

        if not failed:
            downloaded.append(element)
            continue

        print("An error occured", url)
        db.execute(
            """
            DELETE FROM search_results
            WHERE uuid = ?
            """,
            (uuid,)
        )
        db.commit()

        try:
            os.remove(filename)
        except FileNotFoundError:
            pass  # Nothing was written for this result

    return downloaded
def flush_cache(instance_path):
    """
    Delete cached search results that are more than 15 minutes old.

    For every expired row of `search_results`, the downloaded PDF
    (`{instance_path}/search-partitions/{uuid}.pdf`) and its thumbnail
    (`{instance_path}/cache/search-thumbnails/{uuid}.jpg`) are removed when
    present, then the row itself is deleted.
    """
    db = get_db()
    expired_cache = db.execute(
        """
        SELECT uuid FROM search_results
        WHERE creation_time <= datetime('now', '-15 minutes', 'localtime')
        """
    ).fetchall()

    for element in expired_cache:
        uuid = element["uuid"]

        stale_files = (
            f"{instance_path}/search-partitions/{uuid}.pdf",
            f"{instance_path}/cache/search-thumbnails/{uuid}.jpg",
        )
        for path in stale_files:
            try:
                os.remove(path)
            except FileNotFoundError:
                pass  # Already gone, or never created

        # Delete by uuid rather than re-evaluating the time condition: a
        # second "now" is later than the first, so it could match rows whose
        # files were not cleaned up above.
        db.execute(
            """
            DELETE FROM search_results
            WHERE uuid = ?
            """,
            (uuid,)
        )

    db.commit()