{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 59,
   "id": "e1b17564-0abb-41c5-8cf4-7200b014550f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n",
      "[nltk_data]   Package wordnet is already up-to-date!\n",
      "[nltk_data] Downloading package words to /home/sy/nltk_data...\n",
      "[nltk_data]   Package words is already up-to-date!\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import nltk\n",
    "from nltk.corpus import wordnet as wn\n",
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "# Fetch the 'wordnet' corpus data (NLTK reports it as up to date when already present).\n",
    "nltk.download('wordnet')\n",
    "from nltk.corpus import words\n",
    "# Fetch the 'words' corpus as well.\n",
    "nltk.download('words')\n",
    "# Word list of the 'words' corpus. NOTE(review): `ww` is not referenced by any other cell of this notebook.\n",
    "ww = words.words()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "id": "8fe45bc7-a41a-49db-9067-700254f388c0",
   "metadata": {},
   "outputs": [],
   "source": [
    "def format(s):\n",
    "    return ' '.join(s.split('_'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 229,
   "id": "c75240e0-8392-4a7b-9999-dc528b3d17a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import defaultdict\n",
    "\n",
    "# Minimum number of adjective synonyms a word needs before it gets a group.\n",
    "MIN_SYNONYMS = 4\n",
    "\n",
    "# word -> {'group': label, 'clues': [...]}. A plain dict, because the values are\n",
    "# dicts and the set default of the former defaultdict(set) was never used here.\n",
    "dsynonyms = {}\n",
    "\n",
    "for word in wn.words():\n",
    "    # zip pairs every synset of `word` with the synonym list at the same position.\n",
    "    syns = set()\n",
    "    for synset, synonyms in zip(wn.synsets(word), wn.synonyms(word)):\n",
    "        # 'a' and 's' are WordNet's adjective and satellite-adjective tags.\n",
    "        if synset.pos() in ('a', 's'):\n",
    "            syns.update(synonyms)\n",
    "    if len(syns) < MIN_SYNONYMS:\n",
    "        continue\n",
    "    # Skip the word when one of its synonyms already heads a group. Keys of\n",
    "    # dsynonyms are raw WordNet names (underscores), so the raw synonym is looked\n",
    "    # up; the space-formatted clue could never match a multi-word key.\n",
    "    if any(syn in dsynonyms for syn in syns):\n",
    "        continue\n",
    "    # Sorted so the clue order does not depend on set iteration order.\n",
    "    clues = [format(syn) for syn in sorted(syns)]\n",
    "    clues.append(format(word))\n",
    "    dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 230,
   "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Synset('spanish_lime.n.01'), Synset('genip.n.02')]"
      ]
     },
     "execution_count": 230,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the synsets WordNet returns for 'genip'.\n",
    "wn.synsets('genip')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 231,
   "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# flag button for reporting"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1464f8df-180a-4334-b123-d76303140a03",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ecc5527c-a0b0-4e48-ae0a-4cb3a1a8a12b",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 232,
   "id": "e588bdf3-d648-48b3-ab6b-027a07194292",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 243,
   "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load LADECv1-2019.csv (path relative to the working directory), using its first column as the row index.\n",
    "df = pd.read_csv('LADECv1-2019.csv', index_col=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 244,
   "id": "176e2790-560c-4daf-b436-a1771611c4bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only rows whose parse is not marked 'no' and whose compound is flagged as common.\n",
    "# isCommonstim holds integer 0/1 flags (see the frame displayed further down), so the\n",
    "# former comparison against the string 'no' never matched and dropped nothing.\n",
    "df = df[df.correctParse != 'no']\n",
    "df = df[df.isCommonstim != 0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 235,
   "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235",
   "metadata": {},
   "outputs": [],
   "source": [
    "MIN_GROUP_SIZE = 4  # a key needs at least this many rows to be kept\n",
    "\n",
    "\n",
    "def _members_by_key(frame, groups, member_col, min_size=MIN_GROUP_SIZE):\n",
    "    \"\"\"Return [(key, members)] for every group that has at least `min_size` rows.\n",
    "\n",
    "    `groups` maps each key to the index labels of its rows in `frame` (the\n",
    "    `.groups` mapping of a groupby); `members` lists the `member_col` values\n",
    "    of those rows.\n",
    "    \"\"\"\n",
    "    return [\n",
    "        (key, list(frame.loc[list(ids), member_col]))\n",
    "        for key, ids in groups.items()\n",
    "        if len(ids) >= min_size\n",
    "    ]\n",
    "\n",
    "\n",
    "prefixes = df.groupby('c1').groups\n",
    "suffixes = df.groupby('c2').groups\n",
    "# For each c1 value shared by enough rows: the c2 values it is paired with.\n",
    "pres = _members_by_key(df, prefixes, 'c2')\n",
    "# For each c2 value shared by enough rows: the c1 values it is paired with.\n",
    "sufs = _members_by_key(df, suffixes, 'c1')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 236,
   "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586",
   "metadata": {},
   "outputs": [],
   "source": [
    "def _is_plural_of(word, known):\n",
    "    \"\"\"Return True if `word` minus a trailing 's' or 'es' is a member of `known`.\"\"\"\n",
    "    return (word.endswith('s') and word[:-1] in known) or (\n",
    "        word.endswith('es') and word[:-2] in known)\n",
    "\n",
    "\n",
    "dprefix = {}\n",
    "for prefix, ids in pres:\n",
    "    known = set(ids)  # constant-time membership tests\n",
    "    res = []\n",
    "    # dict.fromkeys drops repeated clues while keeping first-seen order, so the\n",
    "    # clue order is stable from run to run (a set's order is not).\n",
    "    for word in dict.fromkeys(ids):\n",
    "        # The 'es' test used to slice the list `ids` instead of the word, so it was always False.\n",
    "        if _is_plural_of(word, known):\n",
    "            continue\n",
    "        res.append(word)\n",
    "    if len(res) < 4:\n",
    "        continue\n",
    "    dprefix[prefix] = dict(group=f'{prefix} _', clues=res)\n",
    "\n",
    "dsuffix = {}\n",
    "for suffix, ids in sufs:\n",
    "    # Drop a plural suffix when its singular already has a group (this relies on\n",
    "    # the singular having been processed first). Both plural forms are looked up\n",
    "    # in dsuffix; the 'es' form was previously looked up in `ids`, which holds\n",
    "    # the clue words of this suffix rather than the suffixes seen so far.\n",
    "    if _is_plural_of(suffix, dsuffix):\n",
    "        continue\n",
    "    if len(ids) < 4:\n",
    "        continue\n",
    "    dsuffix[suffix] = dict(group=f'_ {suffix}', clues=ids)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 237,
   "id": "def43999-d789-4e5c-bb27-4fd29074c875",
   "metadata": {},
   "outputs": [],
   "source": [
    "from Levenshtein import ratio\n",
    "def similar(a, b):\n",
    "    return ratio(a, b) >= .8\n",
    "import inflect\n",
    "\n",
    "p = inflect.engine()\n",
    "\n",
    "def normalize(w):\n",
    "    \"\"\"Unimplemented placeholder: ignores `w` and returns None.\"\"\"\n",
    "    pass\n",
    "\n",
    "def filter_duplicates(group):\n",
    "    if not group:\n",
    "        return []\n",
    "    ok = [group[0]]\n",
    "    for i in range(1, len(group)):\n",
    "        for word in ok:\n",
    "            if similar(word, group[i]):\n",
    "                break\n",
    "        else:\n",
    "            ok.append(group[i])\n",
    "    return ok"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 238,
   "id": "6a3c04eb-79a6-47f5-846e-93258db65921",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clues that process_corpus removes from every group.\n",
    "blacklist = ['man', 'men', 'woman', 'women']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 239,
   "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def process_corpus(corpus):\n",
    "    new = {}\n",
    "    for word, group in corpus.items():\n",
    "        clues = group['clues']\n",
    "        clues = [clue for clue in clues if clue not in blacklist]\n",
    "        clues = filter_duplicates(clues)\n",
    "        if len(clues) < 4:\n",
    "            continue\n",
    "        new[word] = dict(group=group['group'], clues=clues)\n",
    "    return new"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 240,
   "id": "a59a4514-2572-4d35-a73d-fef58d1bc804",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Merge the three group tables. The same word can be a key in more than one\n",
    "# table, and a plain dict.update() silently discarded the earlier group. The\n",
    "# later entry still owns the bare key (as before); the entry it displaces is\n",
    "# kept under its own group label instead of being lost.\n",
    "corpus = {}\n",
    "for table in (dprefix, dsuffix, dsynonyms):\n",
    "    for key, entry in table.items():\n",
    "        if key in corpus:\n",
    "            displaced = corpus[key]\n",
    "            corpus[displaced['group']] = displaced\n",
    "        corpus[key] = entry\n",
    "filtered_corpus = process_corpus(corpus)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 259,
   "id": "8025664c-e116-481a-9609-d58200f773ec",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "437 330\n"
     ]
    }
   ],
   "source": [
    "# Report how many prefix groups and suffix groups were built.\n",
    "print(len(dprefix), len(dsuffix))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 241,
   "id": "fccac4d7-af42-4445-8dd5-6f4b0d3aa9ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "# Write the filtered corpus as a JavaScript file that assigns it to `corpus`.\n",
    "with open('../static/corpus.js', 'w') as out_file:\n",
    "    out_file.write('var corpus = ' + json.dumps(filtered_corpus))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4a82df07-568a-41f9-98c9-be0182522577",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 242,
   "id": "46157b29-1084-4caa-be4f-7c56be562da8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['encroach', 'impinge', 'infringe'],\n",
       " ['encroach', 'entrench', 'impinge', 'trench'],\n",
       " ['invasive', 'trespassing']]"
      ]
     },
     "execution_count": 242,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Inspect the nested synonym lists WordNet returns for 'encroaching'.\n",
    "wn.synonyms('encroaching')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 252,
   "id": "98e6a79f-4e7b-498d-a824-a44b52ae3829",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c1</th>\n",
       "      <th>c2</th>\n",
       "      <th>stim</th>\n",
       "      <th>isCommonC1</th>\n",
       "      <th>isCommonC2</th>\n",
       "      <th>isCommonstim</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id_master</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>3237</th>\n",
       "      <td>gad</td>\n",
       "      <td>about</td>\n",
       "      <td>gadabout</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4592</th>\n",
       "      <td>knock</td>\n",
       "      <td>about</td>\n",
       "      <td>knockabout</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8231</th>\n",
       "      <td>turn</td>\n",
       "      <td>about</td>\n",
       "      <td>turnabout</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6139</th>\n",
       "      <td>race</td>\n",
       "      <td>about</td>\n",
       "      <td>raceabout</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8331</th>\n",
       "      <td>walk</td>\n",
       "      <td>about</td>\n",
       "      <td>walkabout</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4515</th>\n",
       "      <td>junk</td>\n",
       "      <td>yards</td>\n",
       "      <td>junkyards</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6812</th>\n",
       "      <td>ship</td>\n",
       "      <td>yards</td>\n",
       "      <td>shipyards</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2667</th>\n",
       "      <td>farm</td>\n",
       "      <td>yards</td>\n",
       "      <td>farmyards</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1007</th>\n",
       "      <td>brick</td>\n",
       "      <td>yards</td>\n",
       "      <td>brickyards</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8892</th>\n",
       "      <td>zig</td>\n",
       "      <td>zag</td>\n",
       "      <td>zigzag</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>8372 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "              c1     c2        stim  isCommonC1  isCommonC2  isCommonstim\n",
       "id_master                                                                \n",
       "3237         gad  about    gadabout           1           1             1\n",
       "4592       knock  about  knockabout           1           1             1\n",
       "8231        turn  about   turnabout           1           1             1\n",
       "6139        race  about   raceabout           1           1             0\n",
       "8331        walk  about   walkabout           1           1             1\n",
       "...          ...    ...         ...         ...         ...           ...\n",
       "4515        junk  yards   junkyards           1           0             0\n",
       "6812        ship  yards   shipyards           1           0             0\n",
       "2667        farm  yards   farmyards           1           0             0\n",
       "1007       brick  yards  brickyards           1           0             0\n",
       "8892         zig    zag      zigzag           0           0             1\n",
       "\n",
       "[8372 rows x 6 columns]"
      ]
     },
     "execution_count": 252,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the c1/c2/stim columns and the isCommon* flags of df.\n",
    "df[['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 258,
   "id": "ebcdf335-02c3-480c-a241-f83f7569acb0",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>c1</th>\n",
       "      <th>c2</th>\n",
       "      <th>stim</th>\n",
       "      <th>isCommonC1</th>\n",
       "      <th>isCommonC2</th>\n",
       "      <th>isCommonstim</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>id_master</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8361</th>\n",
       "      <td>war</td>\n",
       "      <td>fare</td>\n",
       "      <td>warfare</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2715</th>\n",
       "      <td>field</td>\n",
       "      <td>fare</td>\n",
       "      <td>fieldfare</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1298</th>\n",
       "      <td>car</td>\n",
       "      <td>fare</td>\n",
       "      <td>carfare</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>air</td>\n",
       "      <td>fare</td>\n",
       "      <td>airfare</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "              c1    c2       stim  isCommonC1  isCommonC2  isCommonstim\n",
       "id_master                                                              \n",
       "8361         war  fare    warfare           1           1             1\n",
       "2715       field  fare  fieldfare           1           1             0\n",
       "1298         car  fare    carfare           1           1             1\n",
       "51           air  fare    airfare           1           1             1"
      ]
     },
     "execution_count": 258,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the c1/c2/stim columns and the isCommon* flags for the rows whose c2 is 'fare'.\n",
    "df[df.c2=='fare'][['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "50989f8d-368e-4b4d-ab6c-355efce36c93",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}