From 874d6321c955b2be037ac20d7f6b895d3d53be25 Mon Sep 17 00:00:00 2001
From: cyfraeviolae <cyfraeviolae>
Date: Thu, 30 Nov 2023 18:35:07 -0500
Subject: ok

---
 generators/datageneration.ipynb | 317 ++++++++++++++++++++++++++++++++--------
 1 file changed, 252 insertions(+), 65 deletions(-)

(limited to 'generators/datageneration.ipynb')
diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb
index ee65ead..85c6e02 100644
--- a/generators/datageneration.ipynb
+++ b/generators/datageneration.ipynb
@@ -41,13 +41,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
-   "id": "c75240e0-8392-4a7b-9999-dc528b3d17a1",
+   "execution_count": 60,
+   "id": "27488c82-1e9b-4873-9e6b-e0b5937fc51d",
    "metadata": {},
    "outputs": [],
    "source": [
     "from collections import defaultdict\n",
-    "\n",
+    "lexnames = set()\n",
+    "lexname_blacklist = {\n",
+    "    'noun.plant',\n",
+    "    'noun.animal',\n",
+    "    'noun.person'\n",
+    "}\n",
     "def groups_for_pos(pos='as'):\n",
     "    dsynonyms = []\n",
     "    for word in wn.words():\n",
@@ -55,11 +60,12 @@
     "        synonymss = wn.synonyms(word)\n",
     "        syns = set()\n",
     "        for synset, synonyms in zip(synsets, synonymss):\n",
-    "            if synset.pos() in pos: # 'as'\n",
+    "            if synset.lexname() in lexname_blacklist:\n",
+    "                continue\n",
+    "            if synset.pos() in pos:\n",
     "                syns |= set(synonyms)\n",
     "        if len(syns) >= 4:\n",
     "            clues = [format(clue) for clue in syns]\n",
-    "            \n",
     "            clues.append(format(word))\n",
     "            dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n",
     "            \"\"\"    \n",
@@ -71,8 +77,16 @@
     "                clues.append(format(word))\n",
     "                dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n",
     "            \"\"\"\n",
-    "    return dsynonyms\n",
-    "\n",
+    "    return dsynonyms"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "id": "445789e7-3808-4a28-9df5-9a69313cb4c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "dadj = groups_for_pos('as')\n",
     "dverb = groups_for_pos('v')\n",
     "dnoun = groups_for_pos('n')\n",
@@ -81,7 +95,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 30,
    "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc",
    "metadata": {},
    "outputs": [
@@ -91,7 +105,7 @@
        "(3976, 7141, 19563, 490)"
       ]
      },
-     "execution_count": 4,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -102,20 +116,33 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 62,
    "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368",
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(3976, 7141, 13824, 490)"
+      ]
+     },
+     "execution_count": 62,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(dadj), len(dverb), len(dnoun), len(dadverb)"
+   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 33,
    "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# flag button for reporting"
+    "# flag button for reporting, definitions in webcollectionscollections"
    ]
   },
   {
@@ -129,7 +156,115 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "ecc5527c-a0b0-4e48-ae0a-4cb3a1a8a12b",
+   "id": "f4eb0f95-901d-43d8-8ba7-6103b3a0f6be",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "bdcb3a19-a5cf-48e9-ab6f-aaeed50e2c31",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'adj.all',\n",
+       " 'adj.pert',\n",
+       " 'adj.ppl',\n",
+       " 'adv.all',\n",
+       " 'noun.Tops',\n",
+       " 'noun.act',\n",
+       " 'noun.animal',\n",
+       " 'noun.artifact',\n",
+       " 'noun.attribute',\n",
+       " 'noun.body',\n",
+       " 'noun.cognition',\n",
+       " 'noun.communication',\n",
+       " 'noun.event',\n",
+       " 'noun.feeling',\n",
+       " 'noun.food',\n",
+       " 'noun.group',\n",
+       " 'noun.location',\n",
+       " 'noun.motive',\n",
+       " 'noun.object',\n",
+       " 'noun.person',\n",
+       " 'noun.phenomenon',\n",
+       " 'noun.plant',\n",
+       " 'noun.possession',\n",
+       " 'noun.process',\n",
+       " 'noun.quantity',\n",
+       " 'noun.relation',\n",
+       " 'noun.shape',\n",
+       " 'noun.state',\n",
+       " 'noun.substance',\n",
+       " 'noun.time',\n",
+       " 'verb.body',\n",
+       " 'verb.change',\n",
+       " 'verb.cognition',\n",
+       " 'verb.communication',\n",
+       " 'verb.competition',\n",
+       " 'verb.consumption',\n",
+       " 'verb.contact',\n",
+       " 'verb.creation',\n",
+       " 'verb.emotion',\n",
+       " 'verb.motion',\n",
+       " 'verb.perception',\n",
+       " 'verb.possession',\n",
+       " 'verb.social',\n",
+       " 'verb.stative',\n",
+       " 'verb.weather'}"
+      ]
+     },
+     "execution_count": 57,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "id": "d3a0f1e9-99e5-4aa2-bd72-d1d200bf2b40",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'noun.plant'"
+      ]
+     },
+     "execution_count": 50,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "syn.lexname()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "64ad6047-a09b-41c3-85b6-65476d8dba0e",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88a4aeab-8123-435e-9b44-3e568102f0b1",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6459fdf-fe4e-4dcf-81ef-1688de01be95",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -194,26 +329,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 10,
    "id": "def43999-d789-4e5c-bb27-4fd29074c875",
    "metadata": {},
-   "outputs": [
-    {
-     "ename": "IndentationError",
-     "evalue": "unindent does not match any outer indentation level (<tokenize>, line 84)",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;36m  File \u001b[0;32m<tokenize>:84\u001b[0;36m\u001b[0m\n\u001b[0;31m    r = ratio(a, b)\u001b[0m\n\u001b[0m    ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
     "from Levenshtein import ratio\n",
     "def similar(a, b):\n",
     "    a, b = a.lower(), b.lower()\n",
     "    if len(a) > len(b):\n",
     "        a, b = b, a\n",
-    "    if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b\n",
+    "    if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b:\n",
+    "        return True\n",
+    "    r = ratio(a, b)\n",
+    "    if .8 <= r <= .9:\n",
+    "        pass\n",
+    "        #print(a, b, r)\n",
+    "    return r >= .85\n",
+    "\"\"\"\n",
     "Skip to Main\n",
     "datageneration\n",
     "Last Checkpoint: 11 days ago\n",
@@ -250,7 +383,7 @@
     "            \n",
     "            clues.append(format(word))\n",
     "            dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n",
-    "            \"\"\"    \n",
+    "            \n",
     "            ok = True\n",
     "            for clue in clues:\n",
     "                if clue in dsynonyms:\n",
@@ -258,7 +391,7 @@
     "            if ok:\n",
     "                clues.append(format(word))\n",
     "                dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n",
-    "            \"\"\"\n",
+    "            \n",
     "    return dsynonyms\n",
     "\n",
     "dadj = groups_for_pos('as')\n",
@@ -291,12 +424,7 @@
     ":\n",
     "        return True\n",
     "    # Then, print everything between .8 and .9 to see whats the best cutoff\n",
-    "    r = ratio(a, b)\n",
-    "    if .8 <= r <= .9:\n",
-    "        pass\n",
-    "        #print(a, b, r)\n",
-    "    return r >= .85\n",
-    "\n",
+    "\"\"\"\n",
     "def filter_duplicates(group):\n",
     "    if not group:\n",
     "        return []\n",
@@ -312,7 +440,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 11,
    "id": "6a3c04eb-79a6-47f5-846e-93258db65921",
    "metadata": {},
    "outputs": [],
@@ -322,7 +450,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 63,
    "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483",
    "metadata": {},
    "outputs": [],
@@ -341,37 +469,45 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 64,
    "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9",
    "metadata": {},
    "outputs": [],
    "source": [
-    "corpus = dict(\n",
-    "    adjectives=dadj,\n",
-    "    nouns=dnoun,\n",
-    "    adverbs=dadverb,\n",
-    "    verbs=dverb,\n",
-    "    prefixes=dprefix,\n",
-    "    suffixes=dsuffix,\n",
-    ")\n",
-    "filtered_corpus = {}\n",
-    "for k, d in corpus.items():\n",
-    "    filtered_corpus[k] = process_groups(d)"
+    "corpus = [\n",
+    "    dict(name='suffixes', groups=process_groups(dsuffix), portion=.9),\n",
+    "    dict(name='prefixes', groups=process_groups(dprefix), portion=.8),\n",
+    "    dict(name='verbs', groups=process_groups(dverb), portion=.6),\n",
+    "    dict(name='adverbs', groups=process_groups(dadverb), portion=.54),\n",
+    "    dict(name='nouns', groups=process_groups(dnoun), portion=.2),\n",
+    "    dict(name='adjectives', groups=process_groups(dadj), portion=0),\n",
+    "]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 65,
    "id": "b35092c5-17db-4257-bf45-83e8c3973da4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Synset('basil_thyme.n.01')]"
+      ]
+     },
+     "execution_count": 65,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "filtered_corpus['adverbs']"
+    "wn.synsets('satureja_acinos')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 66,
    "id": "8025664c-e116-481a-9609-d58200f773ec",
    "metadata": {},
    "outputs": [],
@@ -379,17 +515,74 @@
     "# Can lemmatize/singularize and compare levenshtein based on that, but don't use lemma itself for the group as it may bed wrong"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "77256802-04ef-4908-9b39-e9381f6abac5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(13824, 3976, 490, 7141, 226, 196)"
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "id": "66256c50-fcbc-42ad-a17b-057fd0d7dea1",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "25853"
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "sum((len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)))"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "fccac4d7-af42-4445-8dd5-6f4b0d3aa9ca",
+   "id": "635b6a6c-c084-4584-a63f-cde4221e0ad9",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f48e2eec-2ab1-4398-a862-172398c413a0",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 70,
+   "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
     "with open('../static/corpus.js', 'w') as f:\n",
     "    f.write('var corpus = ')\n",
-    "    json.dump(filtered_corpus, f)"
+    "    json.dump(corpus, f)"
    ]
   },
   {
@@ -398,9 +591,7 @@
    "id": "551ce71f-0d75-4e41-8387-808db1e5e20f",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "similar('slow', 'slowl')"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
@@ -419,9 +610,7 @@
      "output_type": "execute_result"
     }
    ],
-   "source": [
-    "len(filtered_corpus)"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
@@ -440,9 +629,7 @@
      "output_type": "execute_result"
     }
    ],
-   "source": [
-    "len(dsuffix)"
-   ]
+   "source": []
   },
   {
    "cell_type": "code",
-- 
cgit v1.2.3