summaryrefslogtreecommitdiff
path: root/generators/datageneration.ipynb
diff options
context:
space:
mode:
authorcyfraeviolae <cyfraeviolae>2023-11-30 18:47:54 -0500
committercyfraeviolae <cyfraeviolae>2023-11-30 18:47:54 -0500
commit27f01fe7ab0e4264b76539f95c5d9c178ebd77c4 (patch)
tree9acce4e5f8474ee4c5aa44c7cc98dd047dc49a03 /generators/datageneration.ipynb
parent874d6321c955b2be037ac20d7f6b895d3d53be25 (diff)
new words
Diffstat (limited to 'generators/datageneration.ipynb')
-rw-r--r--generators/datageneration.ipynb470
1 files changed, 46 insertions, 424 deletions
diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb
index 85c6e02..9bb1407 100644
--- a/generators/datageneration.ipynb
+++ b/generators/datageneration.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 38,
"id": "e1b17564-0abb-41c5-8cf4-7200b014550f",
"metadata": {},
"outputs": [
@@ -25,18 +25,9 @@
"nltk.download('wordnet')\n",
"from nltk.corpus import words\n",
"nltk.download('words')\n",
- "ww = words.words()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "8fe45bc7-a41a-49db-9067-700254f388c0",
- "metadata": {},
- "outputs": [],
- "source": [
- "def format(s):\n",
- " return ' '.join(s.split('_'))"
+ "ww = words.words()\n",
+ "import pandas as pd \n",
+ "import string"
]
},
{
@@ -46,16 +37,26 @@
"metadata": {},
"outputs": [],
"source": [
+ "def format(s):\n",
+ " return ' '.join(s.split('_'))\n",
"from collections import defaultdict\n",
"lexnames = set()\n",
"lexname_blacklist = {\n",
" 'noun.plant',\n",
" 'noun.animal',\n",
- " 'noun.person'\n",
+ " 'noun.person',\n",
+ " 'noun.state',\n",
+ " 'noun.body',\n",
+ " #'noun.location',\n",
+ " #'noun.group',\n",
"}\n",
+ "def is_proper(w):\n",
+ " return w[0] in string.ascii_uppercase\n",
"def groups_for_pos(pos='as'):\n",
" dsynonyms = []\n",
" for word in wn.words():\n",
+ " if is_proper(word):\n",
+ " continue\n",
" synsets = wn.synsets(word)\n",
" synonymss = wn.synonyms(word)\n",
" syns = set()\n",
@@ -63,20 +64,11 @@
" if synset.lexname() in lexname_blacklist:\n",
" continue\n",
" if synset.pos() in pos:\n",
- " syns |= set(synonyms)\n",
+ " syns |= set(syn for syn in synonyms if not is_proper(syn))\n",
" if len(syns) >= 4:\n",
" clues = [format(clue) for clue in syns]\n",
" clues.append(format(word))\n",
" dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n",
- " \"\"\" \n",
- " ok = True\n",
- " for clue in clues:\n",
- " if clue in dsynonyms:\n",
- " ok = False\n",
- " if ok:\n",
- " clues.append(format(word))\n",
- " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n",
- " \"\"\"\n",
" return dsynonyms"
]
},
@@ -95,218 +87,14 @@
},
{
"cell_type": "code",
- "execution_count": 30,
- "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(3976, 7141, 19563, 490)"
- ]
- },
- "execution_count": 30,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(dadj), len(dverb), len(dnoun), len(dadverb)"
- ]
- },
- {
- "cell_type": "code",
"execution_count": 62,
- "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "(3976, 7141, 13824, 490)"
- ]
- },
- "execution_count": 62,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "len(dadj), len(dverb), len(dnoun), len(dadverb)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 33,
- "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9",
- "metadata": {},
- "outputs": [],
- "source": [
- "# flag button for reporting, definitions in webcollectionscollections"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1464f8df-180a-4334-b123-d76303140a03",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "f4eb0f95-901d-43d8-8ba7-6103b3a0f6be",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 57,
- "id": "bdcb3a19-a5cf-48e9-ab6f-aaeed50e2c31",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'adj.all',\n",
- " 'adj.pert',\n",
- " 'adj.ppl',\n",
- " 'adv.all',\n",
- " 'noun.Tops',\n",
- " 'noun.act',\n",
- " 'noun.animal',\n",
- " 'noun.artifact',\n",
- " 'noun.attribute',\n",
- " 'noun.body',\n",
- " 'noun.cognition',\n",
- " 'noun.communication',\n",
- " 'noun.event',\n",
- " 'noun.feeling',\n",
- " 'noun.food',\n",
- " 'noun.group',\n",
- " 'noun.location',\n",
- " 'noun.motive',\n",
- " 'noun.object',\n",
- " 'noun.person',\n",
- " 'noun.phenomenon',\n",
- " 'noun.plant',\n",
- " 'noun.possession',\n",
- " 'noun.process',\n",
- " 'noun.quantity',\n",
- " 'noun.relation',\n",
- " 'noun.shape',\n",
- " 'noun.state',\n",
- " 'noun.substance',\n",
- " 'noun.time',\n",
- " 'verb.body',\n",
- " 'verb.change',\n",
- " 'verb.cognition',\n",
- " 'verb.communication',\n",
- " 'verb.competition',\n",
- " 'verb.consumption',\n",
- " 'verb.contact',\n",
- " 'verb.creation',\n",
- " 'verb.emotion',\n",
- " 'verb.motion',\n",
- " 'verb.perception',\n",
- " 'verb.possession',\n",
- " 'verb.social',\n",
- " 'verb.stative',\n",
- " 'verb.weather'}"
- ]
- },
- "execution_count": 57,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 50,
- "id": "d3a0f1e9-99e5-4aa2-bd72-d1d200bf2b40",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "'noun.plant'"
- ]
- },
- "execution_count": 50,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "syn.lexname()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "64ad6047-a09b-41c3-85b6-65476d8dba0e",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "88a4aeab-8123-435e-9b44-3e568102f0b1",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "b6459fdf-fe4e-4dcf-81ef-1688de01be95",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "id": "e588bdf3-d648-48b3-ab6b-027a07194292",
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_csv('LADECv1-2019.csv', index_col=0)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "id": "176e2790-560c-4daf-b436-a1771611c4bf",
- "metadata": {},
- "outputs": [],
- "source": [
- "df = df.drop(df[df.correctParse == 'no'].index)\n",
- "df = df.drop(df[df.isCommonstim == 0].index)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
"id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235",
"metadata": {},
"outputs": [],
"source": [
+ "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n",
+ "df = df.drop(df[df.correctParse == 'no'].index)\n",
+ "df = df.drop(df[df.isCommonstim == 0].index)\n",
"prefixes = df.groupby('c1').groups\n",
"suffixes = df.groupby('c2').groups\n",
"dprefix = []\n",
@@ -321,15 +109,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586",
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 10,
+ "execution_count": 63,
"id": "def43999-d789-4e5c-bb27-4fd29074c875",
"metadata": {},
"outputs": [],
@@ -346,85 +126,6 @@
" pass\n",
" #print(a, b, r)\n",
" return r >= .85\n",
- "\"\"\"\n",
- "Skip to Main\n",
- "datageneration\n",
- "Last Checkpoint: 11 days ago\n",
- "[Python 3 (ipykernel)]\n",
- "import json\n",
- "import nltk\n",
- "from nltk.corpus import wordnet as wn\n",
- "from nltk.stem.wordnet import WordNetLemmatizer\n",
- "nltk.download('wordnet')\n",
- "from nltk.corpus import words\n",
- "nltk.download('words')\n",
- "ww = words.words()\n",
- "\n",
- "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n",
- "[nltk_data] Package wordnet is already up-to-date!\n",
- "[nltk_data] Downloading package words to /home/sy/nltk_data...\n",
- "[nltk_data] Package words is already up-to-date!\n",
- "\n",
- "def format(s):\n",
- " return ' '.join(s.split('_'))\n",
- "from collections import defaultdict\n",
- "\n",
- "def groups_for_pos(pos='as'):\n",
- " dsynonyms = []\n",
- " for word in wn.words():\n",
- " synsets = wn.synsets(word)\n",
- " synonymss = wn.synonyms(word)\n",
- " syns = set()\n",
- " for synset, synonyms in zip(synsets, synonymss):\n",
- " if synset.pos() in pos: # 'as'\n",
- " syns |= set(synonyms)\n",
- " if len(syns) >= 4:\n",
- " clues = [format(clue) for clue in syns]\n",
- " \n",
- " clues.append(format(word))\n",
- " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n",
- " \n",
- " ok = True\n",
- " for clue in clues:\n",
- " if clue in dsynonyms:\n",
- " ok = False\n",
- " if ok:\n",
- " clues.append(format(word))\n",
- " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n",
- " \n",
- " return dsynonyms\n",
- "\n",
- "dadj = groups_for_pos('as')\n",
- "dverb = groups_for_pos('v')\n",
- "dnoun = groups_for_pos('n')\n",
- "dadverb = groups_for_pos('r')\n",
- "len(dadj), len(dverb), len(dnoun), len(dadverb)\n",
- "\n",
- "(3976, 7141, 19563, 490)\n",
- "\n",
- "\n",
- "# flag button for reporting\n",
- "\n",
- "\n",
- "import pandas as pd\n",
- "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n",
- "df = df.drop(df[df.correctParse == 'no'].index)\n",
- "df = df.drop(df[df.isCommonstim == 0].index)\n",
- "prefixes = df.groupby('c1').groups\n",
- "suffixes = df.groupby('c2').groups\n",
- "dprefix = []\n",
- "for prefix, ids in prefixes.items():\n",
- " if len(ids) >= 4:\n",
- " dprefix.append(dict(answer=prefix, hint=f'{prefix} _', clues=list(df.loc[list(ids)].c2)))\n",
- "dsuffix = []\n",
- "for suffix, ids in suffixes.items():\n",
- " if len(ids) >= 4:\n",
- " dsuffix.append(dict(answer=suffix, hint=f'_ {suffix}', clues=list(df.loc[list(ids)].c1)))\n",
- "\n",
- ":\n",
- " return True\n",
- " # Then, print everything between .8 and .9 to see whats the best cutoff\n",
- "\"\"\"\n",
"def filter_duplicates(group):\n",
" if not group:\n",
" return []\n",
@@ -440,21 +141,12 @@
},
{
"cell_type": "code",
- "execution_count": 11,
- "id": "6a3c04eb-79a6-47f5-846e-93258db65921",
- "metadata": {},
- "outputs": [],
- "source": [
- "blacklist = ['man', 'men', 'woman', 'women']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 63,
- "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483",
+ "execution_count": 64,
+ "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9",
"metadata": {},
"outputs": [],
"source": [
+ "blacklist = ['man', 'men', 'woman', 'women']\n",
"def process_groups(groups):\n",
" new = []\n",
" for group in groups:\n",
@@ -464,16 +156,7 @@
" if len(clues) < 4:\n",
" continue\n",
" new.append(dict(answer=group['answer'], hint=group['hint'], clues=clues))\n",
- " return new"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 64,
- "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9",
- "metadata": {},
- "outputs": [],
- "source": [
+ " return new\n",
"corpus = [\n",
" dict(name='suffixes', groups=process_groups(dsuffix), portion=.9),\n",
" dict(name='prefixes', groups=process_groups(dprefix), portion=.8),\n",
@@ -487,80 +170,61 @@
{
"cell_type": "code",
"execution_count": 65,
- "id": "b35092c5-17db-4257-bf45-83e8c3973da4",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "[Synset('basil_thyme.n.01')]"
- ]
- },
- "execution_count": 65,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "wn.synsets('satureja_acinos')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 66,
- "id": "8025664c-e116-481a-9609-d58200f773ec",
+ "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1",
"metadata": {},
"outputs": [],
"source": [
- "# Can lemmatize/singularize and compare levenshtein based on that, but don't use lemma itself for the group as it may bed wrong"
+ "with open('../static/corpus.js', 'w') as f:\n",
+ " f.write('var corpus = ')\n",
+ " json.dump(corpus, f)"
]
},
{
"cell_type": "code",
- "execution_count": 67,
- "id": "77256802-04ef-4908-9b39-e9381f6abac5",
+ "execution_count": 66,
+ "id": "589f6645-3a52-40ad-9899-717ea7614d00",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "(13824, 3976, 490, 7141, 226, 196)"
+ "'noun.group'"
]
},
- "execution_count": 67,
+ "execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)"
+ "wn.synsets('Islamic_jihad')[0].lexname()"
]
},
{
"cell_type": "code",
- "execution_count": 68,
- "id": "66256c50-fcbc-42ad-a17b-057fd0d7dea1",
+ "execution_count": 67,
+ "id": "8faeb5ee-e1ff-4571-91bf-178cdc7d29f7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "25853"
+ "'ABCDEFGHIJKLMNOPQRSTUVWXYZ'"
]
},
- "execution_count": 68,
+ "execution_count": 67,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "sum((len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)))"
+ " string.ascii_uppercase"
]
},
{
"cell_type": "code",
"execution_count": null,
- "id": "635b6a6c-c084-4584-a63f-cde4221e0ad9",
+ "id": "b8d12e1a-757f-4d87-9374-e7eb656f30d4",
"metadata": {},
"outputs": [],
"source": []
@@ -568,73 +232,31 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "f48e2eec-2ab1-4398-a862-172398c413a0",
+ "id": "68f254d5-7a96-488a-b1a8-6a82dce44271",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
- "execution_count": 70,
- "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1",
- "metadata": {},
- "outputs": [],
- "source": [
- "with open('../static/corpus.js', 'w') as f:\n",
- " f.write('var corpus = ')\n",
- " json.dump(corpus, f)"
- ]
- },
- {
- "cell_type": "code",
"execution_count": null,
- "id": "551ce71f-0d75-4e41-8387-808db1e5e20f",
+ "id": "091e0422-99be-4b20-a20d-17ac296990b6",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
- "execution_count": 38,
- "id": "19589357-f1ca-4d10-8574-3639bd05173f",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "26388"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "id": "dd927be9-a77c-4606-984a-b3cf555b2618",
+ "execution_count": null,
+ "id": "ff46e8c4-b806-43f6-8cf9-d4260bad1ba8",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "195"
- ]
- },
- "execution_count": 39,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
+ "outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
- "id": "f598354d-3f52-4952-a8c0-69c480ebe8b1",
+ "id": "668b25f4-6992-41a3-881c-3a6a72ba0d77",
"metadata": {},
"outputs": [],
"source": []
@@ -642,7 +264,7 @@
{
"cell_type": "code",
"execution_count": null,
- "id": "77cb04f6-846e-454b-98e1-4feb575d2332",
+ "id": "1a0ae159-2a40-4ca5-915a-6ca40df3080f",
"metadata": {},
"outputs": [],
"source": []