Add custom model

2024-10-14 13:33:01 +02:00 · 2024-10-14 13:33:01 +02:00 · ae884cc04a
commit ae884cc04a
parent 86cc497a99
3 changed files with 72 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -1,3 +1,19 @@
 # [Cemantix](https://cemantix.certitudes.org/) CLI (fully local)
-Download models at [Jean-Philippe Fauconnier's website](https://fauconnier.github.io/#data) (`frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin` recommanded)
+# Basic use
 ```bash
 python cemantix.py -m <model.bin>
 > # input your guess
 > # help() to get a hint
 > # clear() to remove words that are not useful
 ```
 # Models
 Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data)
 Le modèle recommandé est disponible [ici]() et est une modification de `frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin`.
 Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés.
 Il est possible de recréer ce modèle avec `src/generate_wordlist.py`, `src/restrict_model.py` et [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary).
--- a/src/generate_wordlist.py
+++ b/src/generate_wordlist.py
@ -0,0 +1,29 @@
 """
 Generate a wordlist from https://github.com/hbenbel/French-Dictionary
 Reads ``<FRENCH_DICT>/dictionary/<pos>.csv`` for every entry of
 ``pos_to_keep`` and writes the retained forms, one per line, to ``OUTPUT``.
 A row is dropped when its first field is the literal "form" (presumably the
 CSV header), when it comes from the verb file and its tags lack
 "infinitive", when its tags contain "plural" or "feminine", when the form is
 two characters long or shorter, or when the form contains any character
 listed in ``banned``.
 """
 FRENCH_DICT = "French-Dictionary" # path to https://github.com/hbenbel/French-Dictionary downloaded repo
 OUTPUT = f"{FRENCH_DICT}/output.txt"
 pos_to_keep = ['adj', 'adv', 'noun', 'prep', 'verb']
 # Characters that disqualify a form: separators, digits and uppercase letters.
 banned = " -_'.:0123456789AZERTYUIOPQSDFGHJKLMWXCVBN"
 # The encoding is explicit so accented forms do not depend on the platform
 # default (assumes the dictionary CSV files are UTF-8 -- TODO confirm).
 # Mode "w" truncates the output, so no separate "clear the file" step is needed.
 with open(OUTPUT, "w", encoding="utf-8") as fop:
     for pos in pos_to_keep:
         with open(f"{FRENCH_DICT}/dictionary/{pos}.csv", "r", encoding="utf-8") as fp:
             for line in fp:
                 # First field is the form; everything after the first comma is
                 # treated as one tag string. Stripping the newline keeps a
                 # comma-less line from producing a blank line in the output.
                 form, _, tags = line.rstrip("\n").partition(",")
                 if form == "form":
                     continue
                 # Only the infinitive of a verb is kept.
                 if pos == "verb" and "infinitive" not in tags:
                     continue
                 if "plural" in tags or "feminine" in tags:
                     continue
                 if len(form) <= 2 or any(char in form for char in banned):
                     continue
                 fop.write(f"{form}\n")
--- a/src/restrict_model.py
+++ b/src/restrict_model.py
@ -0,0 +1,26 @@
 from gensim.models import KeyedVectors
 # Restrict a word2vec model to the words listed in WORD_LIST and save the
 # result, in binary word2vec format, to NEW_MODEL.
 BASE_MODEL = "models/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
 WORD_LIST = "French-Dictionary/output.txt"
 NEW_MODEL = "models/selected_word2vec_model.bin"
 # unicode_errors="ignore" skips undecodable bytes in the vocabulary instead of
 # aborting the load.
 model = KeyedVectors.load_word2vec_format(
     BASE_MODEL,
     binary=True,
     unicode_errors="ignore"
 )
 # splitlines() copes with a missing trailing newline; the previous
 # split("\n")[:-1] dropped the last real word in that case.
 with open(WORD_LIST, "r", encoding="utf-8") as f:
     selected_words = [word for word in f.read().splitlines() if word]
 # The same form can be listed several times (e.g. once per part of speech);
 # keep the first occurrence only so no key is handed to add_vectors twice.
 selected_words = list(dict.fromkeys(selected_words))
 new_model = KeyedVectors(vector_size=model.vector_size)
 # Keep only the words the base model actually has a vector for.
 intersect = [word for word in selected_words if word in model]
 if not intersect:
     # Fail with a readable message rather than an array-shape error later on.
     raise SystemExit(f"No word from {WORD_LIST} was found in {BASE_MODEL}")
 new_model.add_vectors(intersect, [model[word] for word in intersect])
 new_model.fill_norms()
 new_model.save_word2vec_format(NEW_MODEL, binary=True)