Add custom model
This commit is contained in:
parent
86cc497a99
commit
ae884cc04a
18
README.md
18
README.md
@ -1,3 +1,19 @@
|
|||||||
# [Cemantix](https://cemantix.certitudes.org/) CLI (fully local)
|
# [Cemantix](https://cemantix.certitudes.org/) CLI (fully local)
|
||||||
|
|
||||||
Download models at [Jean-Philippe Fauconnier's website](https://fauconnier.github.io/#data) (`frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin` recommended)
|
# Basic use
|
||||||
|
|
||||||
|
```bash
|
||||||
|
python cemantix.py -m <model.bin>
|
||||||
|
> # input your guess
|
||||||
|
> # help() to get a hint
|
||||||
|
> # clear() to remove words that are not useful
|
||||||
|
```
|
||||||
|
|
||||||
|
# Models
|
||||||
|
|
||||||
|
Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data)
|
||||||
|
|
||||||
|
Le modèle recommandé est disponible [ici]() et est une modification de `frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin`.
|
||||||
|
Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés.
|
||||||
|
|
||||||
|
Il est possible de recréer ce modèle avec `src/generate_wordlist.py` et `src/restrict_model.py` et [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary)
|
||||||
|
29
src/generate_wordlist.py
Normal file
29
src/generate_wordlist.py
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
"""
Generate a wordlist from https://github.com/hbenbel/French-Dictionary

Reads the per-part-of-speech CSV files of the dictionary repository and
writes one base form per line to ``OUTPUT``. Plural and feminine forms,
non-infinitive verb forms, very short forms and forms containing banned
characters are dropped.
"""
FRENCH_DICT = "French-Dictionary"  # path to https://github.com/hbenbel/French-Dictionary downloaded repo
OUTPUT = f"{FRENCH_DICT}/output.txt"

# Part-of-speech CSV files (without extension) to read from the dictionary.
pos_to_keep = ['adj', 'adv', 'noun', 'prep', 'verb']
# Any form containing one of these characters is rejected: whitespace,
# punctuation, digits and upper-case letters.
banned = " -_'.:0123456789AZERTYUIOPQSDFGHJKLMWXCVBN"
_BANNED_CHARS = frozenset(banned)


def should_keep(pos, form, tags):
    """Return True when a dictionary entry belongs in the wordlist.

    Args:
        pos: name of the part-of-speech file the entry comes from.
        form: the word form (first CSV column).
        tags: the rest of the CSV line, searched as plain text.
    """
    if form == "form":
        # Header row of the CSV file.
        return False
    if pos == "verb" and "infinitive" not in tags:
        # Only keep infinitives, not conjugated forms.
        return False
    if "plural" in tags or "feminine" in tags:
        return False
    if len(form) <= 2:
        return False
    return _BANNED_CHARS.isdisjoint(form)


def main(french_dict=FRENCH_DICT, output=None):
    """Write the filtered wordlist.

    Args:
        french_dict: path to the downloaded French-Dictionary repository.
        output: destination file; defaults to ``<french_dict>/output.txt``.
    """
    if output is None:
        output = f"{french_dict}/output.txt"

    # A form can appear in several part-of-speech files (or several times in
    # one); emit it only once, at its first occurrence.
    seen = set()
    # Explicit UTF-8 so accented words do not depend on the locale default.
    with open(output, "w", encoding="utf-8") as out:
        for pos in pos_to_keep:
            source = f"{french_dict}/dictionary/{pos}.csv"
            with open(source, "r", encoding="utf-8") as rows:
                for line in rows:
                    # First column is the form; everything after the first
                    # comma is treated as the tag text.
                    form, _, tags = line.rstrip("\r\n").partition(",")
                    if form in seen or not should_keep(pos, form, tags):
                        continue
                    seen.add(form)
                    out.write(f"{form}\n")


if __name__ == "__main__":
    main()
|
26
src/restrict_model.py
Normal file
26
src/restrict_model.py
Normal file
@ -0,0 +1,26 @@
|
|||||||
|
from gensim.models import KeyedVectors
|
||||||
|
|
||||||
|
BASE_MODEL = "models/frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin"
WORD_LIST = "French-Dictionary/output.txt"
NEW_MODEL = "models/selected_word2vec_model.bin"


def load_wordlist(path):
    """Return the words listed in ``path``, one per line.

    Blank lines are skipped and duplicates are dropped while keeping the
    order of first appearance. The last word is kept whether or not the
    file ends with a newline.
    """
    with open(path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(word for word in stripped if word))


def main(base_model=BASE_MODEL, word_list=WORD_LIST, new_model_path=NEW_MODEL):
    """Save a copy of ``base_model`` restricted to the words in ``word_list``.

    Args:
        base_model: path to the source word2vec binary model.
        word_list: path to a text file with one word per line.
        new_model_path: where the restricted word2vec binary is written.

    Raises:
        ValueError: if no word of the list is present in the base model.
    """
    model = KeyedVectors.load_word2vec_format(
        base_model,
        binary=True,
        unicode_errors="ignore"
    )

    # Keep only listed words the base model actually has a vector for.
    intersect = [word for word in load_wordlist(word_list) if word in model]
    if not intersect:
        raise ValueError(
            f"no word from {word_list!r} is present in {base_model!r}"
        )

    new_model = KeyedVectors(vector_size=model.vector_size)
    new_model.add_vectors(intersect, [model[word] for word in intersect])

    new_model.fill_norms()
    new_model.save_word2vec_format(new_model_path, binary=True)


if __name__ == "__main__":
    main()
|
Loading…
Reference in New Issue
Block a user