Add custom model

Create dirs
2024-10-14 13:33:01 +02:00 · 2024-10-14 12:02:51 +02:00
5 changed files with 72 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,19 @@
 # [Cemantix](https://cemantix.certitudes.org/) CLI (fully local)

-Download models at [Jean-Philippe Fauconnier's website](https://fauconnier.github.io/#data) (`frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin` recommanded)
+# Basic use
+
+```bash
+python cemantix.py -m <model.bin>
+> # input your guess
+> # help() to get a hint
+> # clear() to remove words that are not useful
+```
+
+# Models
+
+Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data)
+
+Le modèle recommandé est disponible [ici]() et est une modification de `frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin`.
+Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés.
+
+Il est possible de recréer ce modèle avec `src/generate_wordlist.py` et `src/restrict_model.py`, à partir de [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary).
--- a/models/.gitkeep
+++ b/models/.gitkeep
--- a/src/cemantix.py
+++ b/src/cemantix.py
--- a/src/generate_wordlist.py
+++ b/src/generate_wordlist.py
@ -0,0 +1,29 @@
+"""
+Generate a wordlist from https://github.com/hbenbel/French-Dictionary
+"""
+# Reads the per-part-of-speech CSV files in <FRENCH_DICT>/dictionary/ and
+# writes the retained forms to OUTPUT, one per line.
+#
+# A form is kept when all of the following hold:
+#   * for verb.csv, its tags contain "infinitive";
+#   * its tags contain neither "plural" nor "feminine";
+#   * it is longer than two characters;
+#   * it contains none of the characters listed in `banned`.
+# Each form is written at most once, even if several rows or files list it.
+FRENCH_DICT = "French-Dictionary" # path to https://github.com/hbenbel/French-Dictionary downloaded repo
+OUTPUT = f"{FRENCH_DICT}/output.txt"
+
+# Base names of the CSV files (one per part of speech) to read.
+pos_to_keep = ['adj', 'adv', 'noun', 'prep', 'verb']
+# Any form containing one of these characters is rejected.
+banned = " -_'.:0123456789AZERTYUIOPQSDFGHJKLMWXCVBN"
+
+# Forms already written. The same form can be listed by several rows or by
+# several part-of-speech files; writing it twice would later produce
+# duplicate keys when the word list is used to restrict the model.
+seen = set()
+
+# Mode "w" truncates the file, so no separate clearing step is needed.
+# The encoding is explicit because the words contain accented characters and
+# the platform default is not always UTF-8 (assumes the CSVs are UTF-8 — TODO confirm).
+with open(OUTPUT, "w", encoding="utf-8") as fop:
+    for file in pos_to_keep:
+        with open(f"{FRENCH_DICT}/dictionary/{file}.csv", "r", encoding="utf-8") as fp:
+            for line in fp:
+                # First column is the form; everything after the first comma
+                # is kept as a single tags string.
+                form, _, tags = line.rstrip("\n").partition(",")
+                if form == "form":  # presumably the CSV header row
+                    continue
+
+                if file == "verb" and "infinitive" not in tags:
+                    continue
+
+                if "plural" in tags or "feminine" in tags or len(form) <= 2:
+                    continue
+
+                if any(char in banned for char in form):
+                    continue
+
+                if form in seen:
+                    continue
+
+                seen.add(form)
+                fop.write(f"{form}\n")
--- a/src/restrict_model.py
+++ b/src/restrict_model.py
@ -0,0 +1,26 @@
+from gensim.models import KeyedVectors
+
+# Build a reduced word2vec model that contains only the words of WORD_LIST
+# that are also present in BASE_MODEL, and save it to NEW_MODEL.
+BASE_MODEL = "models/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
+WORD_LIST = "French-Dictionary/output.txt"
+NEW_MODEL = "models/selected_word2vec_model.bin"
+
+model = KeyedVectors.load_word2vec_format(
+    BASE_MODEL,
+    binary=True,
+    unicode_errors="ignore"
+)
+
+with open(WORD_LIST, "r", encoding="utf-8") as f:
+    # One word per line. Blank lines are skipped rather than blindly dropping
+    # the last entry, so a file without a trailing newline keeps its final
+    # word. dict.fromkeys removes duplicates while preserving order, so each
+    # key is added to the new model only once.
+    selected_words = list(dict.fromkeys(
+        word for word in (line.rstrip("\n") for line in f) if word
+    ))
+
+new_model = KeyedVectors(vector_size=model.vector_size)
+# Keep only the words the base model actually has a vector for.
+intersect = [word for word in selected_words if word in model]
+if not intersect:
+    # Fail with a clear message instead of an obscure array-shape error.
+    raise SystemExit(f"No word from {WORD_LIST} was found in {BASE_MODEL}")
+
+new_model.add_vectors(intersect, [model[word] for word in intersect])
+
+new_model.fill_norms()
+new_model.save_word2vec_format(NEW_MODEL, binary=True)
Author	SHA1	Message	Date
augustin64	ae884cc04a	Add custom model	2024-10-14 13:33:01 +02:00
augustin64	86cc497a99	Create dirs	2024-10-14 12:02:51 +02:00