# --- Setup: WordNet + word list -------------------------------------------
import json

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('wordnet')
from nltk.corpus import words
nltk.download('words')
ww = words.words()


def format(s):
    """Replace WordNet's underscore separators in a lemma with spaces.

    NOTE(review): shadows the built-in ``format``; name kept because the
    rest of the notebook calls it.
    """
    return ' '.join(s.split('_'))


from collections import defaultdict

def groups_for_pos(pos='as'):
    """Build synonym groups for every WordNet lemma matching `pos`.

    Parameters
    ----------
    pos : str
        WordNet POS tags accepted for a synset, tested per character
        (e.g. 'as' = adjective + satellite adjective, 'v', 'n', 'r').

    Returns
    -------
    list[dict]
        One dict per word that collected at least 4 synonyms, with keys
        ``answer`` (the word), ``hint`` and ``clues`` (the synonyms plus
        the word itself, underscores replaced by spaces).
    """
    dsynonyms = []
    for word in wn.words():
        # wn.synonyms(word) yields one lemma list per synset, aligned with
        # wn.synsets(word) — zipping pairs each synset with its lemmas.
        synsets = wn.synsets(word)
        synonymss = wn.synonyms(word)
        syns = set()
        for synset, synonyms in zip(synsets, synonymss):
            if synset.pos() in pos:
                syns |= set(synonyms)
        if len(syns) >= 4:
            clues = [format(clue) for clue in syns]
            clues.append(format(word))
            dsynonyms.append(dict(answer=word,
                                  hint=f'synonyms for {format(word)}',
                                  clues=clues))
    return dsynonyms


dadj = groups_for_pos('as')
dverb = groups_for_pos('v')
dnoun = groups_for_pos('n')
dadverb = groups_for_pos('r')
len(dadj), len(dverb), len(dnoun), len(dadverb)

# flag button for reporting

import pandas as pd

# LADEC compound database; first column is the row id.
df = pd.read_csv('LADECv1-2019.csv', index_col=0)

# Keep only correctly parsed, common compound stimuli.
df = df.drop(df[df.correctParse == 'no'].index)
df = df.drop(df[df.isCommonstim == 0].index)

# Row ids grouped by first (c1) and second (c2) compound constituent.
prefixes = df.groupby('c1').groups
suffixes = df.groupby('c2').groups


def _component_groups(groups, frame, clue_col, hint_fmt):
    """One clue group per compound constituent occurring in >= 4 compounds.

    Parameters
    ----------
    groups : dict
        Mapping constituent -> row ids (output of ``groupby(...).groups``).
    frame : pd.DataFrame
        The compound table the row ids index into.
    clue_col : str
        Column holding the *other* constituent, used as the clues.
    hint_fmt : str
        ``str.format`` template for the hint, e.g. ``'{} _'``.
    """
    out = []
    for component, ids in groups.items():
        if len(ids) >= 4:
            out.append(dict(answer=component,
                            hint=hint_fmt.format(component),
                            clues=list(frame.loc[list(ids)][clue_col])))
    return out


# "prefix" groups: shared first constituent, clues are the second halves;
# "suffix" groups: shared second constituent, clues are the first halves.
dprefix = _component_groups(prefixes, df, 'c2', '{} _')
dsuffix = _component_groups(suffixes, df, 'c1', '_ {}')
from Levenshtein import ratio


def similar(a, b):
    """Case-insensitive near-duplicate check between two clue words.

    Simple inflections of the shorter word ('s', 'es', 'ing', 'ly') count
    as the same word; otherwise a Levenshtein similarity cutoff decides.

    Returns
    -------
    bool
        True if `a` and `b` should be treated as duplicates.
    """
    a, b = a.lower(), b.lower()
    if len(a) > len(b):
        a, b = b, a  # make `a` the shorter string
    if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b:
        return True
    # Tuning aid: print everything between .8 and .9 to pick the best cutoff.
    r = ratio(a, b)
    if .8 <= r <= .9:
        pass
        # print(a, b, r)
    return r >= .85


def filter_duplicates(group):
    """Return `group` with clues that near-duplicate an earlier clue removed."""
    if not group:
        return []
    ok = [group[0]]
    for i in range(1, len(group)):
        for word in ok:
            if similar(word, group[i]):
                break
        else:
            # no kept clue was similar -> keep this one too
            ok.append(group[i])
    return ok


# Clue words too generic to be useful.
blacklist = ['man', 'men', 'woman', 'women']
def process_groups(groups):
    """Clean each group's clues and drop groups that become too small.

    Removes blacklisted clues, collapses near-duplicates via
    ``filter_duplicates``, and keeps only groups with at least 4 clues left.
    """
    cleaned = []
    for group in groups:
        clues = [clue for clue in group['clues'] if clue not in blacklist]
        clues = filter_duplicates(clues)
        if len(clues) >= 4:
            cleaned.append(dict(answer=group['answer'],
                                hint=group['hint'],
                                clues=clues))
    return cleaned


corpus = dict(
    adjectives=dadj,
    nouns=dnoun,
    adverbs=dadverb,
    verbs=dverb,
    prefixes=dprefix,
    suffixes=dsuffix,
)
filtered_corpus = {}
for k, d in corpus.items():
    filtered_corpus[k] = process_groups(d)

filtered_corpus['adverbs']

# Could lemmatize/singularize and compare Levenshtein on that basis, but
# don't use the lemma itself for the group as it may be wrong.

# Export as a JS global so the static front-end can load it directly.
with open('../static/corpus.js', 'w') as f:
    f.write('var corpus = ')
    json.dump(filtered_corpus, f)

similar('slow', 'slowl')
# NOTE(review): the recorded output (26388) cannot come from the code above,
# where filtered_corpus is a 6-key dict — stale kernel state; re-run to refresh.
len(filtered_corpus)

len(dsuffix)