Compare commits

..

No commits in common. "main" and "v1.0.0" have entirely different histories.
main ... v1.0.0

3 changed files with 57 additions and 276 deletions

View File

@ -3,22 +3,17 @@
# Basic use # Basic use
```bash ```bash
python src/cemantix.py -m <model.bin> python cemantix.py -m <model.bin>
> # input your guess > # input your guess
> # help() to get a hint > # help() to get a hint
> # clear() to remove words that are not useful > # clear() to remove words that are not useful
``` ```
Additional arguments:
- `-w` `--word` specify an objective word (local only)
- `-r` `--remote` use [cemantix.certitudes.org](https://cemantix.certitudes.org) instead of the local server
- `-s` `--solver` solve automatically instead of asking the user to do it
# Models # Models
Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data) Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data)
Les modèles recommandés sont disponibles [ici](https://gitea.augustin64.fr/augustin64/cemantix-cli/releases/tag/v1.0.0/). Le modèle recommandé est disponible [ici]() et est une modification de `frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin`.
Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés. Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés.
Il est possible de recréer ce modèle avec `src/generate_wordlist.py` et `src/restrict_model.py` et [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary) Il est possible de recréer ce modèle avec `src/generate_wordlist.py` et `src/restrict_model.py` et [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary)

View File

@ -1,3 +1,2 @@
colorama colorama
gensim gensim
requests

View File

@ -1,163 +1,39 @@
from dataclasses import dataclass
from gensim.models import KeyedVectors from gensim.models import KeyedVectors
from colorama import Fore, Back, Style from colorama import Fore, Back, Style
import argparse import argparse
import requests
import readline import readline
import random import random
import json
import time import time
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
@dataclass def random_word(model, k=5, dist=100):
class Command: base_words = [
word: str model.index_to_key[random.randint(0, len(model))]
for _ in range(k)
]
@dataclass complete_list = base_words.copy()
class Guess: for word in base_words:
word: str complete_list += [i[0] for i in model.most_similar(word, topn=dist)]
Input = Command | Guess rk_words = model.rank_by_centrality(complete_list)
return rk_words[random.randint(0,5)][1]
class Server:
inverse_order = False
def __init__(self):
pass
def init_word(self):
pass
def get_rank(self, guess):
pass
def get_temp(self, guess):
pass
def _help(self, rk):
raise NotImplementedError
def _reveal_word(self):
raise NotImplementedError
class LocalServer(Server):
inverse_order = False
def __init__(self, word=None, file="models/selected_word2vec_model.bin"):
self.model = KeyedVectors.load_word2vec_format(
file,
binary=True,
unicode_errors="ignore"
)
self.word = word
self.nearest = []
def init_word(self, k=1, dist=100): def cemantix(model, word=None):
while (self.word is None or len(self.word) < 5 while word is None or len(word) < 5 or '-' in word or '_' in word:
or '-' in self.word or '_' in self.word): word = random_word(model, k=1, dist=1) # augment numbers to try a "smooth selection"
base_words = [
self.model.index_to_key[random.randint(0, len(self.model))]
for _ in range(k)
]
if None in base_words:
continue
complete_list = base_words.copy() nearest = [word]+[i[0] for i in model.most_similar(word, topn=1000)]
for word in base_words: guesses = [] # guess, temp, rank
complete_list += [i[0] for i in self.model.most_similar(word, topn=dist)] def get_rank(guess):
if guess not in nearest:
rk_words = self.model.rank_by_centrality(complete_list)
self.word = rk_words[random.randint(0,5)%len(rk_words)][1]
self.nearest = [self.word]+[i[0] for i in self.model.most_similar(self.word, topn=1000)]
def get_rank(self, guess):
if guess not in self.nearest:
return None return None
return 1000 - self.nearest.index(guess) return 1000 - nearest.index(guess)
def get_temp(self, guess): def formatted_status(guesses, last=None):
return round(100*(1-self.model.distance(self.word, guess)), 2)
def _help(self, rk):
return self.nearest[rk]
def _reveal_word(self):
return self.word
class CemantixServer(Server):
def __init__(self):
self.words = {}
self.solvers = None
self.num = None
def _try(self, word):
url = 'https://cemantix.certitudes.org/score'
headers = {"Origin": "https://cemantix.certitudes.org"}
data = {'word': word}
# Need some additional cookies to be a valid client
r = requests.post(url, headers=headers, data=data)
assert r.ok
data = json.loads(r.text)
self.solvers = int(data["solvers"])
self.num = int(data["num"])
self.words[word] = {"score" : float(data["score"])}
if "percentile" in data:
self.words[word]["percentile"] = data["percentile"]
def get_rank(self, guess):
if guess not in self.words:
self._try(guess)
return self.words[guess].get("percentile", None)
def get_temp(self, guess):
if guess not in self.words:
self._try(guess)
return self.words[guess]["score"]*100
class Client:
inverse_order = False
def __init__(self):
pass
def guess(self) -> str:
raise NotImplementedError
def answer_guess(self, word, temp, dist):
raise NotImplementedError
def correct(self, word):
pass
def reveal(self, word):
pass
def unknow_word(self, word):
print(Fore.RED+"Key not present"+Style.RESET_ALL)
def _clear(self):
pass
def _help(self, word):
pass
def _best_rank(self):
raise NotImplementedError
class UserClient(Client):
def __init__(self):
self.guesses = [] # guess, temp, rank
def formatted_status(self, last=None):
text = "" text = ""
for w, temp, rank in self.guesses: for w, temp, rank in guesses:
if rank is not None: if rank is not None:
text += Back.RED+Fore.BLACK+Style.BRIGHT+str(rank)+Style.RESET_ALL text += Back.RED+Fore.BLACK+Style.BRIGHT+str(rank)+Style.RESET_ALL
text += "\t" text += "\t"
@ -169,133 +45,48 @@ class UserClient(Client):
text += w + Style.RESET_ALL+"\n" text += w + Style.RESET_ALL+"\n"
return text[:-1] return text[:-1]
def guess(self): def tried(word, guessed):
try: return word in [i[0] for i in guessed]
guess = input(Style.BRIGHT+"Your guess > "+Style.RESET_ALL).strip()
readline.add_history(guess)
except (EOFError, KeyboardInterrupt):
raise EOFError
if guess.endswith("()"): def interpret_command(cmd, guesses):
return Command(guess[:-2])
return Guess(guess)
def answer_guess(self, guess, dist, rank):
if guess not in (i[0] for i in self.guesses):
self.guesses.append((guess, dist, rank))
self.guesses.sort(key=lambda x:-x[1] if self.inverse_order else x[1])
print(chr(27) + "[2J")
print(self.formatted_status(last=guess))
def reveal(self, word):
print("The word was "+Style.BRIGHT+word+Style.RESET_ALL)
def correct(self, guess):
time.sleep(1)
print(Fore.GREEN+"Correct!"+Style.RESET_ALL+f" {len(self.guesses)} tries.")
return len(self.guesses)
def _clear(self):
self.guesses = [g for g in self.guesses if g[1] <= 75.]
def _best_rank(self):
return max([rk for _, _, rk in self.guesses if rk is not None]+[749])
def _help(self, word):
print("Maybe try "+Back.YELLOW+Fore.BLACK+word+Style.RESET_ALL)
class AutoClient(Client):
def __init__(self, file="models/selected_word2vec_model.bin"):
self.model = KeyedVectors.load_word2vec_format(
file,
binary=True,
unicode_errors="ignore"
)
self.dictionary = [
self.model.index_to_key[i] for i in range(len(self.model))
if self.model.index_to_key[i] is not None
]
self.num_guesses = 0
def guess(self):
if len(self.dictionary) == 0:
raise EOFError
if len(self.dictionary) < 20:
return Guess(random.choice(self.dictionary))
pos = -int(len(self.dictionary)/10)
return Guess(random.choice(
[w[1] for w in self.model.rank_by_centrality(self.dictionary)[pos:]]
))
def answer_guess(self, guess, dist, rank):
def cem(score):
return 100*(1-score)
self.num_guesses += 1
old_count = len(self.dictionary)
self.dictionary.remove(guess)
self.dictionary = [
w for w in self.dictionary if abs(cem(self.model.distance(guess, w)) - dist) <= 0.01
]
print(f"[{self.num_guesses}] Guessing {guess}: {old_count} => {len(self.dictionary)}")
def reveal(self, word):
print("The word was "+Style.BRIGHT+word+Style.RESET_ALL)
def correct(self, guess):
print(Fore.GREEN+"Correct!"+Style.RESET_ALL+f" {self.num_guesses} tries.")
return self.num_guesses
def unknow_word(self, word):
print(Fore.RED+"Key not present"+Style.RESET_ALL)
self.dictionary.remove(word)
def cemantix(server: Server, client: Client):
server.init_word()
client.inverse_order = server.inverse_order
def interpret_command(cmd):
match cmd: match cmd:
case "clear": case "clear":
client._clear() guesses = [g for g in guesses if g[1] <= 75.]
case "help": case "help":
try: best_rk = max([rk for _, _, rk in guesses if rk is not None]+[749])
client._help(server._help(999-client._best_rank())) print("Maybe try "+Back.YELLOW+Fore.BLACK+nearest[999-best_rk]+Style.RESET_ALL)
except NotImplementedError:
print(Fore.RED+"No help available"+Style.RESET_ALL)
case _: case _:
print(Fore.RED+"Unknown command"+Style.RESET_ALL) print(Fore.RED+"Unknown command"+Style.RESET_ALL)
return guesses
while True: while True:
guess = None
try: try:
match client.guess(): guess = input(Style.BRIGHT+"Your guess > "+Style.RESET_ALL).strip()
case Command(cmd): readline.add_history(guess)
interpret_command(cmd) if guess.endswith("()"):
continue guesses = interpret_command(guess[:-2], guesses)
case Guess(word): continue
guess = word except (EOFError, KeyboardInterrupt):
except EOFError: print("The word was "+Style.BRIGHT+word+Style.RESET_ALL)
try:
client.reveal(server._reveal_word())
except NotImplementedError:
pass
print("Goodbye!") print("Goodbye!")
return -1 return -1
try: try:
dist = server.get_temp(guess) dist = round(round(model.distance(word, guess), 4)*100, 2)
except KeyError: except KeyError:
client.unknow_word(guess) print(Fore.RED+"Key not present"+Style.RESET_ALL)
continue continue
rank = server.get_rank(guess) if not tried(guess, guesses):
client.answer_guess(guess, dist, rank) guesses.append((guess, dist, get_rank(guess)))
if rank == 1000: guesses.sort(key=lambda x:-x[1])
client.correct(guess) print(chr(27) + "[2J")
return guess print(formatted_status(guesses, last=guess))
if guess == word:
time.sleep(1)
print(Fore.GREEN+"Correct!"+Style.RESET_ALL+f" {len(guesses)} tries.")
return len(guesses)
@ -305,20 +96,16 @@ def main():
parser.add_argument("-w", "--word", dest="word", action="store", parser.add_argument("-w", "--word", dest="word", action="store",
help="Specify goal word") help="Specify goal word")
parser.add_argument("-m", "--model", dest="model", action="store", parser.add_argument("-m", "--model", dest="model", action="store",
default="models/selected_word2vec_model.bin", default="frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin",
help="Specify model to use") help="Specify model to use")
parser.add_argument("-r", "--remote", dest="remote", action="store_true",
default=False,
help="Use cemantix.certitudes.org instead of local model")
parser.add_argument("-s", "--solver", dest="auto_solver", action="store_true",
default=False,
help="Use auto solver")
args = parser.parse_args() args = parser.parse_args()
client = UserClient() if not args.auto_solver else AutoClient(file=args.model) model = KeyedVectors.load_word2vec_format(
server = LocalServer(word=args.word, file=args.model) if not args.remote else CemantixServer() args.model,
return cemantix(server, client) binary=True,
unicode_errors="ignore"
)
cemantix(model, word=args.word)
if __name__ == "__main__": if __name__ == "__main__":
main() main()