From 27f01fe7ab0e4264b76539f95c5d9c178ebd77c4 Mon Sep 17 00:00:00 2001
From: cyfraeviolae <cyfraeviolae>
Date: Thu, 30 Nov 2023 18:47:54 -0500
Subject: new words

---
 generators/datageneration.ipynb | 470 ++++------------------------------------
 1 file changed, 46 insertions(+), 424 deletions(-)

(limited to 'generators/datageneration.ipynb')
diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb
index 85c6e02..9bb1407 100644
--- a/generators/datageneration.ipynb
+++ b/generators/datageneration.ipynb
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 38,
    "id": "e1b17564-0abb-41c5-8cf4-7200b014550f",
    "metadata": {},
    "outputs": [
@@ -25,18 +25,9 @@
     "nltk.download('wordnet')\n",
     "from nltk.corpus import words\n",
     "nltk.download('words')\n",
-    "ww = words.words()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "8fe45bc7-a41a-49db-9067-700254f388c0",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def format(s):\n",
-    "    return ' '.join(s.split('_'))"
+    "ww = words.words()\n",
+    "import pandas as pd \n",
+    "import string"
    ]
   },
   {
@@ -46,16 +37,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "def format(s):\n",
+    "    return ' '.join(s.split('_'))\n",
     "from collections import defaultdict\n",
     "lexnames = set()\n",
     "lexname_blacklist = {\n",
     "    'noun.plant',\n",
     "    'noun.animal',\n",
-    "    'noun.person'\n",
+    "    'noun.person',\n",
+    "    'noun.state',\n",
+    "    'noun.body',\n",
+    "    #'noun.location',\n",
+    "    #'noun.group',\n",
     "}\n",
+    "def is_proper(w):\n",
+    "    return w[0] in string.ascii_uppercase\n",
     "def groups_for_pos(pos='as'):\n",
     "    dsynonyms = []\n",
     "    for word in wn.words():\n",
+    "        if is_proper(word):\n",
+    "            continue\n",
     "        synsets = wn.synsets(word)\n",
     "        synonymss = wn.synonyms(word)\n",
     "        syns = set()\n",
@@ -63,20 +64,11 @@
     "            if synset.lexname() in lexname_blacklist:\n",
     "                continue\n",
     "            if synset.pos() in pos:\n",
-    "                syns |= set(synonyms)\n",
+    "                syns |= set(syn for syn in synonyms if not is_proper(syn))\n",
     "        if len(syns) >= 4:\n",
     "            clues = [format(clue) for clue in syns]\n",
     "            clues.append(format(word))\n",
     "            dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n",
-    "            \"\"\"    \n",
-    "            ok = True\n",
-    "            for clue in clues:\n",
-    "                if clue in dsynonyms:\n",
-    "                    ok = False\n",
-    "            if ok:\n",
-    "                clues.append(format(word))\n",
-    "                dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n",
-    "            \"\"\"\n",
     "    return dsynonyms"
    ]
   },
@@ -93,220 +85,16 @@
     "dadverb = groups_for_pos('r')"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(3976, 7141, 19563, 490)"
-      ]
-     },
-     "execution_count": 30,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(dadj), len(dverb), len(dnoun), len(dadverb)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 62,
-   "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "(3976, 7141, 13824, 490)"
-      ]
-     },
-     "execution_count": 62,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "len(dadj), len(dverb), len(dnoun), len(dadverb)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 33,
-   "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# flag button for reporting, definitions in webcollectionscollections"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1464f8df-180a-4334-b123-d76303140a03",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f4eb0f95-901d-43d8-8ba7-6103b3a0f6be",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 57,
-   "id": "bdcb3a19-a5cf-48e9-ab6f-aaeed50e2c31",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "{'adj.all',\n",
-       " 'adj.pert',\n",
-       " 'adj.ppl',\n",
-       " 'adv.all',\n",
-       " 'noun.Tops',\n",
-       " 'noun.act',\n",
-       " 'noun.animal',\n",
-       " 'noun.artifact',\n",
-       " 'noun.attribute',\n",
-       " 'noun.body',\n",
-       " 'noun.cognition',\n",
-       " 'noun.communication',\n",
-       " 'noun.event',\n",
-       " 'noun.feeling',\n",
-       " 'noun.food',\n",
-       " 'noun.group',\n",
-       " 'noun.location',\n",
-       " 'noun.motive',\n",
-       " 'noun.object',\n",
-       " 'noun.person',\n",
-       " 'noun.phenomenon',\n",
-       " 'noun.plant',\n",
-       " 'noun.possession',\n",
-       " 'noun.process',\n",
-       " 'noun.quantity',\n",
-       " 'noun.relation',\n",
-       " 'noun.shape',\n",
-       " 'noun.state',\n",
-       " 'noun.substance',\n",
-       " 'noun.time',\n",
-       " 'verb.body',\n",
-       " 'verb.change',\n",
-       " 'verb.cognition',\n",
-       " 'verb.communication',\n",
-       " 'verb.competition',\n",
-       " 'verb.consumption',\n",
-       " 'verb.contact',\n",
-       " 'verb.creation',\n",
-       " 'verb.emotion',\n",
-       " 'verb.motion',\n",
-       " 'verb.perception',\n",
-       " 'verb.possession',\n",
-       " 'verb.social',\n",
-       " 'verb.stative',\n",
-       " 'verb.weather'}"
-      ]
-     },
-     "execution_count": 57,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 50,
-   "id": "d3a0f1e9-99e5-4aa2-bd72-d1d200bf2b40",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'noun.plant'"
-      ]
-     },
-     "execution_count": 50,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "syn.lexname()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "64ad6047-a09b-41c3-85b6-65476d8dba0e",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "88a4aeab-8123-435e-9b44-3e568102f0b1",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b6459fdf-fe4e-4dcf-81ef-1688de01be95",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "e588bdf3-d648-48b3-ab6b-027a07194292",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.read_csv('LADECv1-2019.csv', index_col=0)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "176e2790-560c-4daf-b436-a1771611c4bf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = df.drop(df[df.correctParse == 'no'].index)\n",
-    "df = df.drop(df[df.isCommonstim == 0].index)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
    "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235",
    "metadata": {},
    "outputs": [],
    "source": [
+    "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n",
+    "df = df.drop(df[df.correctParse == 'no'].index)\n",
+    "df = df.drop(df[df.isCommonstim == 0].index)\n",
     "prefixes = df.groupby('c1').groups\n",
     "suffixes = df.groupby('c2').groups\n",
     "dprefix = []\n",
@@ -321,15 +109,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586",
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 63,
    "id": "def43999-d789-4e5c-bb27-4fd29074c875",
    "metadata": {},
    "outputs": [],
@@ -346,85 +126,6 @@
     "        pass\n",
     "        #print(a, b, r)\n",
     "    return r >= .85\n",
-    "\"\"\"\n",
-    "Skip to Main\n",
-    "datageneration\n",
-    "Last Checkpoint: 11 days ago\n",
-    "[Python 3 (ipykernel)]\n",
-    "import json\n",
-    "import nltk\n",
-    "from nltk.corpus import wordnet as wn\n",
-    "from nltk.stem.wordnet import WordNetLemmatizer\n",
-    "nltk.download('wordnet')\n",
-    "from nltk.corpus import words\n",
-    "nltk.download('words')\n",
-    "ww = words.words()\n",
-    "\n",
-    "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n",
-    "[nltk_data]   Package wordnet is already up-to-date!\n",
-    "[nltk_data] Downloading package words to /home/sy/nltk_data...\n",
-    "[nltk_data]   Package words is already up-to-date!\n",
-    "\n",
-    "def format(s):\n",
-    "    return ' '.join(s.split('_'))\n",
-    "from collections import defaultdict\n",
-    "\n",
-    "def groups_for_pos(pos='as'):\n",
-    "    dsynonyms = []\n",
-    "    for word in wn.words():\n",
-    "        synsets = wn.synsets(word)\n",
-    "        synonymss = wn.synonyms(word)\n",
-    "        syns = set()\n",
-    "        for synset, synonyms in zip(synsets, synonymss):\n",
-    "            if synset.pos() in pos: # 'as'\n",
-    "                syns |= set(synonyms)\n",
-    "        if len(syns) >= 4:\n",
-    "            clues = [format(clue) for clue in syns]\n",
-    "            \n",
-    "            clues.append(format(word))\n",
-    "            dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n",
-    "            \n",
-    "            ok = True\n",
-    "            for clue in clues:\n",
-    "                if clue in dsynonyms:\n",
-    "                    ok = False\n",
-    "            if ok:\n",
-    "                clues.append(format(word))\n",
-    "                dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n",
-    "            \n",
-    "    return dsynonyms\n",
-    "\n",
-    "dadj = groups_for_pos('as')\n",
-    "dverb = groups_for_pos('v')\n",
-    "dnoun = groups_for_pos('n')\n",
-    "dadverb = groups_for_pos('r')\n",
-    "len(dadj), len(dverb), len(dnoun), len(dadverb)\n",
-    "\n",
-    "(3976, 7141, 19563, 490)\n",
-    "\n",
-    "\n",
-    "# flag button for reporting\n",
-    "\n",
-    "\n",
-    "import pandas as pd\n",
-    "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n",
-    "df = df.drop(df[df.correctParse == 'no'].index)\n",
-    "df = df.drop(df[df.isCommonstim == 0].index)\n",
-    "prefixes = df.groupby('c1').groups\n",
-    "suffixes = df.groupby('c2').groups\n",
-    "dprefix = []\n",
-    "for prefix, ids in prefixes.items():\n",
-    "    if len(ids) >= 4:\n",
-    "        dprefix.append(dict(answer=prefix, hint=f'{prefix} _', clues=list(df.loc[list(ids)].c2)))\n",
-    "dsuffix = []\n",
-    "for suffix, ids in suffixes.items():\n",
-    "    if len(ids) >= 4:\n",
-    "        dsuffix.append(dict(answer=suffix, hint=f'_ {suffix}', clues=list(df.loc[list(ids)].c1)))\n",
-    "\n",
-    ":\n",
-    "        return True\n",
-    "    # Then, print everything between .8 and .9 to see whats the best cutoff\n",
-    "\"\"\"\n",
     "def filter_duplicates(group):\n",
     "    if not group:\n",
     "        return []\n",
@@ -440,21 +141,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "id": "6a3c04eb-79a6-47f5-846e-93258db65921",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "blacklist = ['man', 'men', 'woman', 'women']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 63,
-   "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483",
+   "execution_count": 64,
+   "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9",
    "metadata": {},
    "outputs": [],
    "source": [
+    "blacklist = ['man', 'men', 'woman', 'women']\n",
     "def process_groups(groups):\n",
     "    new = []\n",
     "    for group in groups:\n",
@@ -464,16 +156,7 @@
     "        if len(clues) < 4:\n",
     "            continue\n",
     "        new.append(dict(answer=group['answer'], hint=group['hint'], clues=clues))\n",
-    "    return new"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 64,
-   "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
+    "    return new\n",
     "corpus = [\n",
     "    dict(name='suffixes', groups=process_groups(dsuffix), portion=.9),\n",
     "    dict(name='prefixes', groups=process_groups(dprefix), portion=.8),\n",
@@ -487,80 +170,61 @@
   {
    "cell_type": "code",
    "execution_count": 65,
-   "id": "b35092c5-17db-4257-bf45-83e8c3973da4",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[Synset('basil_thyme.n.01')]"
-      ]
-     },
-     "execution_count": 65,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "wn.synsets('satureja_acinos')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 66,
-   "id": "8025664c-e116-481a-9609-d58200f773ec",
+   "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Can lemmatize/singularize and compare levenshtein based on that, but don't use lemma itself for the group as it may bed wrong"
+    "with open('../static/corpus.js', 'w') as f:\n",
+    "    f.write('var corpus = ')\n",
+    "    json.dump(corpus, f)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
-   "id": "77256802-04ef-4908-9b39-e9381f6abac5",
+   "execution_count": 66,
+   "id": "589f6645-3a52-40ad-9899-717ea7614d00",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(13824, 3976, 490, 7141, 226, 196)"
+       "'noun.group'"
       ]
      },
-     "execution_count": 67,
+     "execution_count": 66,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)"
+    "wn.synsets('Islamic_jihad')[0].lexname()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 68,
-   "id": "66256c50-fcbc-42ad-a17b-057fd0d7dea1",
+   "execution_count": 67,
+   "id": "8faeb5ee-e1ff-4571-91bf-178cdc7d29f7",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "25853"
+       "'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"
       ]
      },
-     "execution_count": 68,
+     "execution_count": 67,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "sum((len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)))"
+    "    string.ascii_uppercase"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "635b6a6c-c084-4584-a63f-cde4221e0ad9",
+   "id": "b8d12e1a-757f-4d87-9374-e7eb656f30d4",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -568,73 +232,31 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f48e2eec-2ab1-4398-a862-172398c413a0",
+   "id": "68f254d5-7a96-488a-b1a8-6a82dce44271",
    "metadata": {},
    "outputs": [],
    "source": []
   },
-  {
-   "cell_type": "code",
-   "execution_count": 70,
-   "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "with open('../static/corpus.js', 'w') as f:\n",
-    "    f.write('var corpus = ')\n",
-    "    json.dump(corpus, f)"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "551ce71f-0d75-4e41-8387-808db1e5e20f",
+   "id": "091e0422-99be-4b20-a20d-17ac296990b6",
    "metadata": {},
    "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
-   "execution_count": 38,
-   "id": "19589357-f1ca-4d10-8574-3639bd05173f",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "26388"
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "id": "dd927be9-a77c-4606-984a-b3cf555b2618",
+   "execution_count": null,
+   "id": "ff46e8c4-b806-43f6-8cf9-d4260bad1ba8",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "195"
-      ]
-     },
-     "execution_count": 39,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": []
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f598354d-3f52-4952-a8c0-69c480ebe8b1",
+   "id": "668b25f4-6992-41a3-881c-3a6a72ba0d77",
    "metadata": {},
    "outputs": [],
    "source": []
@@ -642,7 +264,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "77cb04f6-846e-454b-98e1-4feb575d2332",
+   "id": "1a0ae159-2a40-4ca5-915a-6ca40df3080f",
    "metadata": {},
    "outputs": [],
    "source": []
-- 
cgit v1.2.3