In [20]:
import json
import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import words
nltk.download('words')
ww = words.words()
import pandas as pd 
import string

[nltk_data] Downloading package wordnet to /home/sy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package words to /home/sy/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [21]:
def format(s):
    """Return ``s`` with every underscore replaced by a single space.

    Used to turn underscore-joined lemma names (e.g. ``'come_up_to'``) into
    display text. Note that this name shadows the built-in ``format``.
    """
    return s.replace('_', ' ')
from collections import defaultdict
# Empty set; nothing in the visible cells reads or fills it.
lexnames = set()

# Synsets whose lexname() is in this set are skipped when synonym groups
# are built.
lexname_blacklist = {
    'noun.plant', 'noun.animal', 'noun.person', 'noun.state', 'noun.body',
}
# Currently NOT excluded (kept here for easy toggling):
#   'noun.location', 'noun.group'
def is_proper(w):
    """Return True when ``w`` starts with an ASCII uppercase letter.

    This is the notebook's cheap proper-noun test: capitalised lemma names
    are treated as proper and filtered out by the callers.

    An empty string is reported as not proper. The previous implementation
    indexed ``w[0]`` unconditionally and raised ``IndexError`` on ``''``.
    """
    # Guard first: '' has no first character to inspect.
    return bool(w) and w[0] in string.ascii_uppercase
def groups_for_pos(pos='as'):
    """Collect synonym groups for every WordNet word with senses of the given POS.

    Args:
        pos: string of part-of-speech tags; a synset is used when its
            ``pos()`` value is contained in this string.

    Returns:
        A list of dicts with keys ``answer`` (the raw lemma name), ``hint``
        and ``clues``. ``clues`` is a list of ``(synonym, definition)``
        tuples with underscores in the synonym replaced by spaces. Only
        words with at least 4 distinct ``(synonym, definition)`` pairs are
        kept. The pairs are gathered in a set, so clue order is arbitrary.
    """
    groups = []
    for word in wn.words():
        if is_proper(word):
            continue

        pairs = set()
        # wn.synonyms(word) is paired positionally with wn.synsets(word):
        # one list of synonyms per synset.
        for sense, sense_synonyms in zip(wn.synsets(word), wn.synonyms(word)):
            if sense.lexname() in lexname_blacklist:
                continue
            if sense.pos() not in pos:
                continue
            gloss = sense.definition()
            pairs |= {(name, gloss) for name in sense_synonyms if not is_proper(name)}

        if len(pairs) < 4:
            continue

        # The answer word itself is deliberately not added to the clues.
        clues = [(format(name), gloss) for (name, gloss) in pairs]
        groups.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))
    return groups

In [22]:
# Build the synonym groups once per part-of-speech tag string, in the same
# order as before: 'as', 'v', 'n', 'r'.
dadj, dverb, dnoun, dadverb = [
    groups_for_pos(tags) for tags in ('as', 'v', 'n', 'r')
]

In [23]:
def find_def(w):
    """Return the definition of the first WordNet synset found for ``w``.

    Falls back to the string ``'no definition provided'`` when WordNet
    returns no synsets for ``w``.
    """
    senses = wn.synsets(w)
    return senses[0].definition() if senses else 'no definition provided'

# Load the LADEC table and discard rows whose correctParse is 'no', then rows
# whose isCommonstim is 0 (the second filter is evaluated on the already
# reduced frame, exactly as before).
df = (
    pd.read_csv('LADECv1-2019.csv', index_col=0)
    .pipe(lambda table: table.drop(table[table.correctParse == 'no'].index))
    .pipe(lambda table: table.drop(table[table.isCommonstim == 0].index))
)

# Row labels grouped by first constituent (c1) and by second constituent (c2).
prefixes = df.groupby('c1').groups
suffixes = df.groupby('c2').groups

# A constituent becomes a group only when it appears in at least 4 rows.
# Each clue is the other constituent, paired with the definition looked up
# for the concatenated compound.
dprefix = [
    dict(
        answer=prefix,
        hint=f'{prefix} _',
        clues=[(tail, find_def(prefix + tail)) for tail in list(df.loc[list(ids)].c2)],
    )
    for prefix, ids in prefixes.items()
    if len(ids) >= 4
]
dsuffix = [
    dict(
        answer=suffix,
        hint=f'_ {suffix}',
        clues=[(head, find_def(head + suffix)) for head in list(df.loc[list(ids)].c1)],
    )
    for suffix, ids in suffixes.items()
    if len(ids) >= 4
]

In [24]:
from Levenshtein import ratio
def similar(a, b):
    """Decide whether two words are near-duplicates of each other.

    Comparison is case-insensitive. Two words count as similar when the
    longer one equals the shorter one plus 's', 'es', 'ing' or 'ly', or
    when their Levenshtein ``ratio`` is at least 0.85.
    """
    # Stable sort by length: equal-length inputs keep their (a, b) order.
    shorter, longer = sorted((a.lower(), b.lower()), key=len)
    if any(shorter + ending == longer for ending in ('s', 'es', 'ing', 'ly')):
        return True
    return ratio(shorter, longer) >= .85
def filter_duplicates(group):
    """Drop entries whose first element is ``similar`` to an earlier kept one.

    ``group`` is a sequence of tuples whose first item is the word to
    compare. Entries are examined in order; the first one is always kept and
    each later one is kept only if it is not similar to anything already
    kept. Returns a new list (empty for empty input).
    """
    kept = []
    for candidate in group:
        if not any(similar(earlier[0], candidate[0]) for earlier in kept):
            kept.append(candidate)
    return kept

In [25]:
# Clue words that are never allowed to appear in a group.
blacklist = ['man', 'men', 'woman', 'women']
def process_groups(groups):
    """Clean each group's clues and keep only groups that still have 4 or more.

    For every group dict (keys ``answer``, ``hint``, ``clues``) the
    blacklisted clue words are removed, near-duplicate clues are filtered
    with ``filter_duplicates``, and the group is dropped when fewer than 4
    clues survive. Returns a new list of new dicts; the input is not modified.
    """
    survivors = []
    for group in groups:
        allowed = [(clue, defn) for clue, defn in group['clues'] if clue not in blacklist]
        distinct = filter_duplicates(allowed)
        if len(distinct) >= 4:
            survivors.append({
                'answer': group['answer'],
                'hint': group['hint'],
                'clues': distinct,
            })
    return survivors
# One entry per category: (name, raw groups, portion).
# NOTE(review): the meaning of `portion` is not visible in this notebook;
# it is presumably consumed by the front-end. Confirm before changing.
corpus = [
    dict(name=name, groups=process_groups(raw_groups), portion=portion)
    for name, raw_groups, portion in (
        ('suffixes', dsuffix, .9),
        ('prefixes', dprefix, .8),
        ('verbs', dverb, .6),
        ('adverbs', dadverb, .54),
        ('nouns', dnoun, .2),
        ('adjectives', dadj, 0),
    )
]

In [42]:
# Deep-copy the corpus through a JSON round trip; every clue in the copy is
# then replaced with [clue text, [category index, group index, clue index]].
word_corpus = json.loads(json.dumps(corpus))
for i, category in enumerate(corpus):
    kind = category['name']
    for j, group in enumerate(category['groups']):
        answer = group['answer']
        clue_list = group['clues']
        for k, clue in enumerate(clue_list):
            text = clue[0]
            # For affix categories the full word is the compound itself;
            # otherwise it is just the clue text.
            if kind == 'suffixes':
                full_word = text + answer
            elif kind == 'prefixes':
                full_word = answer + text
            else:
                full_word = text
            word_corpus[i]['groups'][j]['clues'][k] = [text, [i, j, k]]
            # Slice to the first two fields so re-running the cell does not
            # keep appending full_word.
            clue_list[k] = clue[:2] + (full_word,)

In [43]:
# Emit each structure as a JavaScript file: a `var ... = ` prefix followed
# by the JSON payload.
for js_path, js_prefix, payload in (
    ('../static/corpus.js', 'var corpus = ', word_corpus),
    ('../static/full_corpus.js', 'var fullCorpus = ', corpus),
):
    with open(js_path, 'w') as out:
        out.write(js_prefix)
        json.dump(payload, out)

In [28]:
# Spot check: show the definition of the first WordNet synset for 'malaria'.
wn.synsets('malaria')[0].definition()

'an infective disease caused by sporozoan parasites that are transmitted through the bite of an infected Anopheles mosquito; marked by paroxysms of chills and fever'

In [29]:
# Number of groups that survived processing, one count per corpus category
# (same order as the entries in `corpus`).
[len(x['groups']) for x in corpus]

[194, 214, 6853, 437, 6421, 3569]

In [30]:
# Display the whole corpus structure for manual inspection.
# NOTE(review): this renders every group of every category; consider
# showing a small slice instead to keep the notebook output manageable.
corpus

[{'name': 'suffixes',
  'groups': [{'answer': 'about',
    'hint': '_ about',
    'clues': [('gad',
      'a restless seeker after amusement or social companionship'),
     ('knock', 'a sloop with a simplified rig and no bowsprit'),
     ('turn', 'a decision to reverse an earlier decision'),
     ('walk', 'a walking trip or tour'),
     ('run', 'an open automobile having a front seat and a rumble seat'),
     ('round',
      'a road junction at which traffic streams circularly around a central island'),
     ('roust', "a member of a ship's crew who performs manual labor"),
     ('lay', 'person who does no work')]},
   {'answer': 'ache',
    'hint': '_ ache',
    'clues': [('head',
      'something or someone that causes anxiety; a source of unhappiness'),
     ('belly', 'an ache localized in the stomach or abdominal region'),
     ('heart',
      'intense sorrow caused by loss of a loved one (especially by death)'),
     ('ear', 'an ache localized in the middle or inner ear'),
     ('b

In [11]:
# Inspect what wn.synonyms returns for a multi-word lemma: a list of
# synonym lists.
wn.synonyms('come_up_to')

[['accost', 'address']]