Compare commits

..

No commits in common. "main" and "v1.0.0" have entirely different histories.
main ... v1.0.0

3 changed files with 57 additions and 276 deletions

View File

@ -3,22 +3,17 @@
# Basic use
```bash
python src/cemantix.py -m <model.bin>
python cemantix.py -m <model.bin>
> # input your guess
> # help() to get a hint
> # clear() to remove words that are not useful
```
Additional arguments:
- `-w` `--word` specify an objective word (local only)
- `-r` `--remote` use [cemantix.certitudes.org](https://cemantix.certitudes.org) instead of local server
- `-s` `--solver` solve automatically instead of asking user to do it
# Models
Les modèles de base viennent du [site de Jean-Philippe Fauconnier](https://fauconnier.github.io/#data)
Les modèles recommandés sont disponibles [ici](https://gitea.augustin64.fr/augustin64/cemantix-cli/releases/tag/v1.0.0/).
Le modèle recommandé est disponible [ici]() et est une modification de `frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin`.
Les mots qui ne sont pas dans le dictionnaire, les verbes conjugués et autres ont été supprimés.
Il est possible de recréer ce modèle avec `src/generate_wordlist.py` et `src/restrict_model.py` et [ce dictionnaire français](https://github.com/hbenbel/French-Dictionary)

View File

@ -1,3 +1,2 @@
colorama
gensim
requests

View File

@ -1,163 +1,39 @@
from dataclasses import dataclass
from gensim.models import KeyedVectors
from colorama import Fore, Back, Style
import argparse
import requests
import readline
import random
import json
import time
import numpy as np
np.seterr(divide='ignore', invalid='ignore')
@dataclass
class Command:
    """A console command typed by the player instead of a guess.

    The interactive client builds one when the input ends with ``()``;
    the trailing parentheses are removed before it is stored.
    """

    # Command name without the trailing "()", e.g. "help" or "clear".
    word: str
def random_word(model, k=5, dist=100):
base_words = [
model.index_to_key[random.randint(0, len(model))]
for _ in range(k)
]
@dataclass
class Guess:
    """A word proposed by a client as a candidate answer."""

    # The guessed word, exactly as the client produced it.
    word: str
complete_list = base_words.copy()
for word in base_words:
complete_list += [i[0] for i in model.most_similar(word, topn=dist)]
Input = Command | Guess
class Server:
    """Base interface of a game back end that scores guesses.

    The default implementations of ``init_word``, ``get_rank`` and
    ``get_temp`` do nothing and return ``None``; ``_help`` and
    ``_reveal_word`` raise ``NotImplementedError`` so the game loop can
    detect back ends that cannot provide them.
    """

    # Copied onto the client by the game loop; the interactive client uses
    # it to choose the direction in which guesses are sorted.
    inverse_order = False

    def __init__(self):
        pass

    def init_word(self):
        """Prepare the word to find. No-op in the base class."""
        return None

    def get_rank(self, guess):
        """Return the rank of ``guess``, or ``None`` when it has none.

        The game loop treats a rank of 1000 as the correct answer.
        """
        return None

    def get_temp(self, guess):
        """Return the score ("temperature") of ``guess``."""
        return None

    def _help(self, rk):
        """Return the hint word at position ``rk``.

        Raises:
            NotImplementedError: the back end cannot give hints.
        """
        raise NotImplementedError

    def _reveal_word(self):
        """Return the word to find.

        Raises:
            NotImplementedError: the back end cannot reveal the word.
        """
        raise NotImplementedError
class LocalServer(Server):
inverse_order = False
def __init__(self, word=None, file="models/selected_word2vec_model.bin"):
self.model = KeyedVectors.load_word2vec_format(
file,
binary=True,
unicode_errors="ignore"
)
self.word = word
self.nearest = []
rk_words = model.rank_by_centrality(complete_list)
return rk_words[random.randint(0,5)][1]
def init_word(self, k=1, dist=100):
while (self.word is None or len(self.word) < 5
or '-' in self.word or '_' in self.word):
base_words = [
self.model.index_to_key[random.randint(0, len(self.model))]
for _ in range(k)
]
if None in base_words:
continue
def cemantix(model, word=None):
while word is None or len(word) < 5 or '-' in word or '_' in word:
word = random_word(model, k=1, dist=1) # augment numbers to try a "smooth selection"
complete_list = base_words.copy()
for word in base_words:
complete_list += [i[0] for i in self.model.most_similar(word, topn=dist)]
rk_words = self.model.rank_by_centrality(complete_list)
self.word = rk_words[random.randint(0,5)%len(rk_words)][1]
self.nearest = [self.word]+[i[0] for i in self.model.most_similar(self.word, topn=1000)]
def get_rank(self, guess):
if guess not in self.nearest:
nearest = [word]+[i[0] for i in model.most_similar(word, topn=1000)]
guesses = [] # guess, temp, rank
def get_rank(guess):
if guess not in nearest:
return None
return 1000 - self.nearest.index(guess)
return 1000 - nearest.index(guess)
def get_temp(self, guess):
return round(100*(1-self.model.distance(self.word, guess)), 2)
def _help(self, rk):
return self.nearest[rk]
def _reveal_word(self):
return self.word
class CemantixServer(Server):
    """Back end that scores guesses through cemantix.certitudes.org.

    Every answer is cached in ``self.words`` so a word is sent to the
    remote service at most once.
    """

    SCORE_URL = 'https://cemantix.certitudes.org/score'
    ORIGIN = "https://cemantix.certitudes.org"
    # Seconds to wait for the service; without a timeout ``requests`` can
    # block forever on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        super().__init__()
        self.words = {}      # word -> {"score": float, "percentile": ...}
        self.solvers = None  # "solvers" field of the last response
        self.num = None      # "num" field of the last response

    def _try(self, word):
        """Submit ``word`` to the service and cache its score.

        Raises:
            requests.HTTPError: the service answered with an error status.
            KeyError: the response lacks an expected field; the game loop
                reports this as an unknown word.
        """
        headers = {"Origin": self.ORIGIN}
        # NOTE: Need some additional cookies to be a valid client
        r = requests.post(
            self.SCORE_URL,
            headers=headers,
            data={'word': word},
            timeout=self.REQUEST_TIMEOUT,
        )
        # An explicit check instead of ``assert``: asserts are removed when
        # Python runs with -O, which would let error pages through.
        r.raise_for_status()
        data = json.loads(r.text)

        self.solvers = int(data["solvers"])
        self.num = int(data["num"])

        entry = {"score": float(data["score"])}
        if "percentile" in data:
            entry["percentile"] = data["percentile"]
        self.words[word] = entry

    def get_rank(self, guess):
        """Return the percentile reported for ``guess``, or ``None``."""
        if guess not in self.words:
            self._try(guess)
        return self.words[guess].get("percentile", None)

    def get_temp(self, guess):
        """Return the score reported for ``guess`` multiplied by 100."""
        if guess not in self.words:
            self._try(guess)
        return self.words[guess]["score"]*100
class Client:
    """Base interface of a player (human or automatic) driven by the game loop.

    ``guess``, ``answer_guess`` and ``_best_rank`` must be provided by
    subclasses; the remaining hooks default to doing nothing.
    """

    # Overwritten by the game loop with the server's ``inverse_order``.
    inverse_order = False

    def __init__(self):
        pass

    def guess(self) -> str:
        """Produce the next input of the player.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def answer_guess(self, word, temp, dist):
        """Receive the server's answer for a guess.

        The game loop passes, in order, the guessed word, the value of
        ``get_temp`` and the value of ``get_rank``; the subclasses in this
        file name these parameters ``(guess, dist, rank)``.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError

    def correct(self, word):
        """Hook called when ``word`` is the right answer. No-op here."""
        return None

    def reveal(self, word):
        """Hook called to show the answer when the game is left. No-op here."""
        return None

    def unknow_word(self, word):
        """Report that ``word`` could not be scored."""
        print(Fore.RED+"Key not present"+Style.RESET_ALL)

    def _clear(self):
        """Hook for the ``clear`` command. No-op here."""
        return None

    def _help(self, word):
        """Hook receiving the hint ``word`` for the ``help`` command. No-op here."""
        return None

    def _best_rank(self):
        """Return the best rank reached so far.

        Raises:
            NotImplementedError: always, in the base class.
        """
        raise NotImplementedError
class UserClient(Client):
def __init__(self):
self.guesses = [] # guess, temp, rank
def formatted_status(self, last=None):
def formatted_status(guesses, last=None):
text = ""
for w, temp, rank in self.guesses:
for w, temp, rank in guesses:
if rank is not None:
text += Back.RED+Fore.BLACK+Style.BRIGHT+str(rank)+Style.RESET_ALL
text += "\t"
@ -168,134 +44,49 @@ class UserClient(Client):
text += Style.BRIGHT+Back.WHITE+Fore.BLACK
text += w + Style.RESET_ALL+"\n"
return text[:-1]
def guess(self):
try:
guess = input(Style.BRIGHT+"Your guess > "+Style.RESET_ALL).strip()
readline.add_history(guess)
except (EOFError, KeyboardInterrupt):
raise EOFError
if guess.endswith("()"):
return Command(guess[:-2])
return Guess(guess)
def tried(word, guessed):
return word in [i[0] for i in guessed]
def answer_guess(self, guess, dist, rank):
if guess not in (i[0] for i in self.guesses):
self.guesses.append((guess, dist, rank))
self.guesses.sort(key=lambda x:-x[1] if self.inverse_order else x[1])
print(chr(27) + "[2J")
print(self.formatted_status(last=guess))
def reveal(self, word):
print("The word was "+Style.BRIGHT+word+Style.RESET_ALL)
def correct(self, guess):
time.sleep(1)
print(Fore.GREEN+"Correct!"+Style.RESET_ALL+f" {len(self.guesses)} tries.")
return len(self.guesses)
def _clear(self):
self.guesses = [g for g in self.guesses if g[1] <= 75.]
def _best_rank(self):
return max([rk for _, _, rk in self.guesses if rk is not None]+[749])
def _help(self, word):
print("Maybe try "+Back.YELLOW+Fore.BLACK+word+Style.RESET_ALL)
class AutoClient(Client):
    """Client that plays on its own by eliminating candidates with a model.

    It keeps a list of candidate words and, after each answer, retains only
    the words whose score against the guess (computed with its own model)
    matches the score returned by the server.
    """

    def __init__(self, file="models/selected_word2vec_model.bin"):
        """Load the word2vec model at ``file`` and list every candidate."""
        super().__init__()
        self.model = KeyedVectors.load_word2vec_format(
            file,
            binary=True,
            unicode_errors="ignore"
        )
        # Every key of the model is a candidate until an answer rules it out.
        self.dictionary = [
            word for word in self.model.index_to_key if word is not None
        ]
        self.num_guesses = 0

    def guess(self):
        """Return the next ``Guess``.

        Raises:
            EOFError: no candidate is left; the game loop treats this as
                the end of the game.
        """
        if not self.dictionary:
            raise EOFError

        if len(self.dictionary) < 20:
            return Guess(random.choice(self.dictionary))

        # Draw from the last tenth of the centrality ranking of the
        # remaining candidates.
        tail = len(self.dictionary) // 10
        ranked = self.model.rank_by_centrality(self.dictionary)
        return Guess(random.choice([entry[1] for entry in ranked[-tail:]]))

    def answer_guess(self, guess, dist, rank):
        """Drop every candidate that is inconsistent with the server's answer.

        ``dist`` is the score returned by the server's ``get_temp``. A
        candidate is kept when ``100 * (1 - distance(guess, candidate))``
        is within 0.01 of it.
        NOTE(review): this presumes the server scores words the same way
        this model does; confirm when used against the remote server.
        """
        self.num_guesses += 1
        old_count = len(self.dictionary)

        # The guess itself is discarded in the same pass as the filtering.
        self.dictionary = [
            w for w in self.dictionary
            if w != guess
            and abs(100*(1-self.model.distance(guess, w)) - dist) <= 0.01
        ]
        print(f"[{self.num_guesses}] Guessing {guess}: {old_count} => {len(self.dictionary)}")

    def reveal(self, word):
        """Print the word that had to be found."""
        print("The word was "+Style.BRIGHT+word+Style.RESET_ALL)

    def correct(self, guess):
        """Print the success message and return the number of guesses made."""
        print(Fore.GREEN+"Correct!"+Style.RESET_ALL+f" {self.num_guesses} tries.")
        return self.num_guesses

    def unknow_word(self, word):
        """Report the unknown ``word`` and stop proposing it."""
        super().unknow_word(word)
        self.dictionary.remove(word)
def cemantix(server: Server, client: Client):
server.init_word()
client.inverse_order = server.inverse_order
def interpret_command(cmd):
def interpret_command(cmd, guesses):
match cmd:
case "clear":
client._clear()
guesses = [g for g in guesses if g[1] <= 75.]
case "help":
try:
client._help(server._help(999-client._best_rank()))
except NotImplementedError:
print(Fore.RED+"No help available"+Style.RESET_ALL)
best_rk = max([rk for _, _, rk in guesses if rk is not None]+[749])
print("Maybe try "+Back.YELLOW+Fore.BLACK+nearest[999-best_rk]+Style.RESET_ALL)
case _:
print(Fore.RED+"Unknown command"+Style.RESET_ALL)
return guesses
while True:
guess = None
try:
match client.guess():
case Command(cmd):
interpret_command(cmd)
continue
case Guess(word):
guess = word
except EOFError:
try:
client.reveal(server._reveal_word())
except NotImplementedError:
pass
guess = input(Style.BRIGHT+"Your guess > "+Style.RESET_ALL).strip()
readline.add_history(guess)
if guess.endswith("()"):
guesses = interpret_command(guess[:-2], guesses)
continue
except (EOFError, KeyboardInterrupt):
print("The word was "+Style.BRIGHT+word+Style.RESET_ALL)
print("Goodbye!")
return -1
try:
dist = server.get_temp(guess)
dist = round(round(model.distance(word, guess), 4)*100, 2)
except KeyError:
client.unknow_word(guess)
print(Fore.RED+"Key not present"+Style.RESET_ALL)
continue
rank = server.get_rank(guess)
client.answer_guess(guess, dist, rank)
if rank == 1000:
client.correct(guess)
return guess
if not tried(guess, guesses):
guesses.append((guess, dist, get_rank(guess)))
guesses.sort(key=lambda x:-x[1])
print(chr(27) + "[2J")
print(formatted_status(guesses, last=guess))
if guess == word:
time.sleep(1)
print(Fore.GREEN+"Correct!"+Style.RESET_ALL+f" {len(guesses)} tries.")
return len(guesses)
@ -305,20 +96,16 @@ def main():
parser.add_argument("-w", "--word", dest="word", action="store",
help="Specify goal word")
parser.add_argument("-m", "--model", dest="model", action="store",
default="models/selected_word2vec_model.bin",
default="frWac_non_lem_no_postag_no_phrase_200_skip_cut100.bin",
help="Specify model to use")
parser.add_argument("-r", "--remote", dest="remote", action="store_true",
default=False,
help="Use cemantix.certitudes.org instead of local model")
parser.add_argument("-s", "--solver", dest="auto_solver", action="store_true",
default=False,
help="Use auto solver")
args = parser.parse_args()
client = UserClient() if not args.auto_solver else AutoClient(file=args.model)
server = LocalServer(word=args.word, file=args.model) if not args.remote else CemantixServer()
return cemantix(server, client)
model = KeyedVectors.load_word2vec_format(
args.model,
binary=True,
unicode_errors="ignore"
)
cemantix(model, word=args.word)
# Run the command-line entry point only when executed as a script.
if __name__ == "__main__":
    main()