From ae884cc04a526042f5a8b59fe5c396ef3317d45e Mon Sep 17 00:00:00 2001
From: augustin64
Date: Mon, 14 Oct 2024 13:33:01 +0200
Subject: [PATCH] Add custom model

---
 README.md                | 18 +++++++++++++++++-
 src/generate_wordlist.py | 29 +++++++++++++++++++++++++++++
 src/restrict_model.py    | 26 ++++++++++++++++++++++++++
 3 files changed, 72 insertions(+), 1 deletion(-)
 create mode 100644 src/generate_wordlist.py
 create mode 100644 src/restrict_model.py

diff --git a/README.md b/README.md
index 32085dc..8bc7f50 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,19 @@
 # [Cemantix](https://cemantix.certitudes.org/) CLI (fully local)
 
-Download models at [Jean-Philippe Fauconnier's website](https://fauconnier.github.io/#data) (`frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin` recommanded)
+# Basic use
+
+```bash
+python cemantix.py -m
+> # input your guess
+> # help() to get a hint
+> # clear() to remove words that are not useful
+```
+
+# Models
+
+Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data)
+
+Le modèle recommandé est disponible [ici]() et est une modification de `frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin`.
+Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés.
+
+Il est possible de recréer ce modèle avec `src/generate_wordlist.py` et `src/restrict_model.py` et [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary)
diff --git a/src/generate_wordlist.py b/src/generate_wordlist.py
new file mode 100644
index 0000000..2b24370
--- /dev/null
+++ b/src/generate_wordlist.py
@@ -0,0 +1,49 @@
+"""
+Generate a wordlist from https://github.com/hbenbel/French-Dictionary
+
+Reads the CSV file of each part of speech listed in pos_to_keep and writes
+every kept form once, one per line, to OUTPUT.
+"""
+import csv
+
+FRENCH_DICT = "French-Dictionary"  # path to https://github.com/hbenbel/French-Dictionary downloaded repo
+OUTPUT = f"{FRENCH_DICT}/output.txt"
+
+pos_to_keep = ['adj', 'adv', 'noun', 'prep', 'verb']
+# A form containing any of these characters (separators, digits, uppercase
+# letters) is rejected.
+banned = set(" -_'.:0123456789AZERTYUIOPQSDFGHJKLMWXCVBN")
+
+
+def is_base_form(pos, form, tags):
+    """Return True when the entry (form, tags) of part of speech pos is kept.
+
+    Verbs are kept only when tagged "infinitive". Entries tagged "plural" or
+    "feminine", forms of at most 2 characters and forms containing a banned
+    character are dropped.
+    """
+    if pos == "verb" and "infinitive" not in tags:
+        return False
+    if "plural" in tags or "feminine" in tags or len(form) <= 2:
+        return False
+    return not any(char in banned for char in form)
+
+
+seen = set()  # forms already written; the same form may appear in several rows or files
+# "w" truncates OUTPUT. The encoding is explicit so that accented forms do not
+# depend on the platform default encoding.
+with open(OUTPUT, "w", encoding="utf-8") as fop:
+    for file in pos_to_keep:
+        path = f"{FRENCH_DICT}/dictionary/{file}.csv"
+        # newline="" is what the csv module expects from the file it reads
+        with open(path, "r", encoding="utf-8", newline="") as fp:
+            for row in csv.reader(fp):
+                if not row:  # blank line
+                    continue
+                form, tags = row[0], ",".join(row[1:])
+                if form == "form":  # CSV header
+                    continue
+                if form in seen or not is_base_form(file, form, tags):
+                    continue
+                seen.add(form)
+                fop.write(f"{form}\n")
diff --git a/src/restrict_model.py b/src/restrict_model.py
new file mode 100644
index 0000000..532aabe
--- /dev/null
+++ b/src/restrict_model.py
@@ -0,0 +1,33 @@
+"""
+Build a smaller word2vec model that only keeps the words listed in WORD_LIST
+(one word per line) that are also present in BASE_MODEL.
+"""
+from gensim.models import KeyedVectors
+
+BASE_MODEL = "models/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
+WORD_LIST = "French-Dictionary/output.txt"
+NEW_MODEL = "models/selected_word2vec_model.bin"
+
+model = KeyedVectors.load_word2vec_format(
+    BASE_MODEL,
+    binary=True,
+    unicode_errors="ignore"
+)
+
+with open(WORD_LIST, "r", encoding="utf-8") as f:
+    # Iterating over lines keeps the last word even when the file has no
+    # trailing newline (split("\n")[:-1] would drop it).
+    words = (line.rstrip("\n") for line in f)
+    # dict.fromkeys drops duplicated words while keeping the file order, so no
+    # key is handed to add_vectors twice.
+    selected_words = list(dict.fromkeys(word for word in words if word))
+
+intersect = [word for word in selected_words if word in model]
+if not intersect:
+    raise SystemExit(f"No word from {WORD_LIST} was found in {BASE_MODEL}")
+
+new_model = KeyedVectors(vector_size=model.vector_size)
+new_model.add_vectors(intersect, [model[word] for word in intersect])
+
+new_model.fill_norms()
+new_model.save_word2vec_format(NEW_MODEL, binary=True)