From 27f01fe7ab0e4264b76539f95c5d9c178ebd77c4 Mon Sep 17 00:00:00 2001 From: cyfraeviolae Date: Thu, 30 Nov 2023 18:47:54 -0500 Subject: new words --- generators/datageneration.ipynb | 470 ++++------------------------------------ 1 file changed, 46 insertions(+), 424 deletions(-) (limited to 'generators/datageneration.ipynb') diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb index 85c6e02..9bb1407 100644 --- a/generators/datageneration.ipynb +++ b/generators/datageneration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 38, "id": "e1b17564-0abb-41c5-8cf4-7200b014550f", "metadata": {}, "outputs": [ @@ -25,18 +25,9 @@ "nltk.download('wordnet')\n", "from nltk.corpus import words\n", "nltk.download('words')\n", - "ww = words.words()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "8fe45bc7-a41a-49db-9067-700254f388c0", - "metadata": {}, - "outputs": [], - "source": [ - "def format(s):\n", - " return ' '.join(s.split('_'))" + "ww = words.words()\n", + "import pandas as pd \n", + "import string" ] }, { @@ -46,16 +37,26 @@ "metadata": {}, "outputs": [], "source": [ + "def format(s):\n", + " return ' '.join(s.split('_'))\n", "from collections import defaultdict\n", "lexnames = set()\n", "lexname_blacklist = {\n", " 'noun.plant',\n", " 'noun.animal',\n", - " 'noun.person'\n", + " 'noun.person',\n", + " 'noun.state',\n", + " 'noun.body',\n", + " #'noun.location',\n", + " #'noun.group',\n", "}\n", + "def is_proper(w):\n", + " return w[0] in string.ascii_uppercase\n", "def groups_for_pos(pos='as'):\n", " dsynonyms = []\n", " for word in wn.words():\n", + " if is_proper(word):\n", + " continue\n", " synsets = wn.synsets(word)\n", " synonymss = wn.synonyms(word)\n", " syns = set()\n", @@ -63,20 +64,11 @@ " if synset.lexname() in lexname_blacklist:\n", " continue\n", " if synset.pos() in pos:\n", - " syns |= set(synonyms)\n", + " syns |= set(syn for syn in synonyms if not is_proper(syn))\n", " if len(syns) >= 4:\n", " clues = [format(clue) for clue in syns]\n", " clues.append(format(word))\n", " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", - " \"\"\" \n", - " ok = True\n", - " for clue in clues:\n", - " if clue in dsynonyms:\n", - " ok = False\n", - " if ok:\n", - " clues.append(format(word))\n", - " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n", - " \"\"\"\n", " return dsynonyms" ] }, @@ -93,220 +85,16 @@ "dadverb = groups_for_pos('r')" ] }, - { - "cell_type": "code", - "execution_count": 30, - "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3976, 7141, 19563, 490)" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(dadj), len(dverb), len(dnoun), len(dadverb)" - ] - }, { "cell_type": "code", "execution_count": 62, - "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(3976, 7141, 13824, 490)" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(dadj), len(dverb), len(dnoun), len(dadverb)" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9", - "metadata": {}, - "outputs": [], - "source": [ - "# flag button for reporting, definitions in webcollectionscollections" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1464f8df-180a-4334-b123-d76303140a03", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4eb0f95-901d-43d8-8ba7-6103b3a0f6be", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "bdcb3a19-a5cf-48e9-ab6f-aaeed50e2c31", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'adj.all',\n", - " 'adj.pert',\n", - " 'adj.ppl',\n", - " 'adv.all',\n", - " 'noun.Tops',\n", - " 'noun.act',\n", - " 'noun.animal',\n", - " 'noun.artifact',\n", - " 'noun.attribute',\n", - " 'noun.body',\n", - " 'noun.cognition',\n", - " 'noun.communication',\n", - " 'noun.event',\n", - " 'noun.feeling',\n", - " 'noun.food',\n", - " 'noun.group',\n", - " 'noun.location',\n", - " 'noun.motive',\n", - " 'noun.object',\n", - " 'noun.person',\n", - " 'noun.phenomenon',\n", - " 'noun.plant',\n", - " 'noun.possession',\n", - " 'noun.process',\n", - " 'noun.quantity',\n", - " 'noun.relation',\n", - " 'noun.shape',\n", - " 'noun.state',\n", - " 'noun.substance',\n", - " 'noun.time',\n", - " 'verb.body',\n", - " 'verb.change',\n", - " 'verb.cognition',\n", - " 'verb.communication',\n", - " 'verb.competition',\n", - " 'verb.consumption',\n", - " 'verb.contact',\n", - " 'verb.creation',\n", - " 'verb.emotion',\n", - " 'verb.motion',\n", - " 'verb.perception',\n", - " 'verb.possession',\n", - " 'verb.social',\n", - " 'verb.stative',\n", - " 'verb.weather'}" - ] - }, - "execution_count": 57, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 50, - "id": "d3a0f1e9-99e5-4aa2-bd72-d1d200bf2b40", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'noun.plant'" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "syn.lexname()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "64ad6047-a09b-41c3-85b6-65476d8dba0e", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "88a4aeab-8123-435e-9b44-3e568102f0b1", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6459fdf-fe4e-4dcf-81ef-1688de01be95", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "e588bdf3-d648-48b3-ab6b-027a07194292", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('LADECv1-2019.csv', index_col=0)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "176e2790-560c-4daf-b436-a1771611c4bf", - "metadata": {}, - "outputs": [], - "source": [ - "df = df.drop(df[df.correctParse == 'no'].index)\n", - "df = df.drop(df[df.isCommonstim == 0].index)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235", "metadata": {}, "outputs": [], "source": [ + "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n", + "df = df.drop(df[df.correctParse == 'no'].index)\n", + "df = df.drop(df[df.isCommonstim == 0].index)\n", "prefixes = df.groupby('c1').groups\n", "suffixes = df.groupby('c2').groups\n", "dprefix = []\n", @@ -321,15 +109,7 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 10, + "execution_count": 63, "id": "def43999-d789-4e5c-bb27-4fd29074c875", "metadata": {}, "outputs": [], @@ -346,85 +126,6 @@ " pass\n", " #print(a, b, r)\n", " return r >= .85\n", - "\"\"\"\n", - "Skip to Main\n", - "datageneration\n", - "Last Checkpoint: 11 days ago\n", - "[Python 3 (ipykernel)]\n", - "import json\n", - "import nltk\n", - "from nltk.corpus import wordnet as wn\n", - "from nltk.stem.wordnet import WordNetLemmatizer\n", - "nltk.download('wordnet')\n", - "from nltk.corpus import words\n", - "nltk.download('words')\n", - "ww = words.words()\n", - "\n", - "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n", - "[nltk_data] Package wordnet is already up-to-date!\n", - "[nltk_data] Downloading package words to /home/sy/nltk_data...\n", - "[nltk_data] Package words is already up-to-date!\n", - "\n", - "def format(s):\n", - " return ' '.join(s.split('_'))\n", - "from collections import defaultdict\n", - "\n", - "def groups_for_pos(pos='as'):\n", - " dsynonyms = []\n", - " for word in wn.words():\n", - " synsets = wn.synsets(word)\n", - " synonymss = wn.synonyms(word)\n", - " syns = set()\n", - " for synset, synonyms in zip(synsets, synonymss):\n", - " if synset.pos() in pos: # 'as'\n", - " syns |= set(synonyms)\n", - " if len(syns) >= 4:\n", - " clues = [format(clue) for clue in syns]\n", - " \n", - " clues.append(format(word))\n", - " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", - " \n", - " ok = True\n", - " for clue in clues:\n", - " if clue in dsynonyms:\n", - " ok = False\n", - " if ok:\n", - " clues.append(format(word))\n", - " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n", - " \n", - " return dsynonyms\n", - "\n", - "dadj = groups_for_pos('as')\n", - "dverb = groups_for_pos('v')\n", - "dnoun = groups_for_pos('n')\n", - "dadverb = groups_for_pos('r')\n", - "len(dadj), len(dverb), len(dnoun), len(dadverb)\n", - "\n", - "(3976, 7141, 19563, 490)\n", - "\n", - "\n", - "# flag button for reporting\n", - "\n", - "\n", - "import pandas as pd\n", - "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n", - "df = df.drop(df[df.correctParse == 'no'].index)\n", - "df = df.drop(df[df.isCommonstim == 0].index)\n", - "prefixes = df.groupby('c1').groups\n", - "suffixes = df.groupby('c2').groups\n", - "dprefix = []\n", - "for prefix, ids in prefixes.items():\n", - " if len(ids) >= 4:\n", - " dprefix.append(dict(answer=prefix, hint=f'{prefix} _', clues=list(df.loc[list(ids)].c2)))\n", - "dsuffix = []\n", - "for suffix, ids in suffixes.items():\n", - " if len(ids) >= 4:\n", - " dsuffix.append(dict(answer=suffix, hint=f'_ {suffix}', clues=list(df.loc[list(ids)].c1)))\n", - "\n", - ":\n", - " return True\n", - " # Then, print everything between .8 and .9 to see whats the best cutoff\n", - "\"\"\"\n", "def filter_duplicates(group):\n", " if not group:\n", " return []\n", @@ -440,21 +141,12 @@ }, { "cell_type": "code", - "execution_count": 11, - "id": "6a3c04eb-79a6-47f5-846e-93258db65921", - "metadata": {}, - "outputs": [], - "source": [ - "blacklist = ['man', 'men', 'woman', 'women']" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483", + "execution_count": 64, + "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9", "metadata": {}, "outputs": [], "source": [ + "blacklist = ['man', 'men', 'woman', 'women']\n", "def process_groups(groups):\n", " new = []\n", " for group in groups:\n", @@ -464,16 +156,7 @@ " if len(clues) < 4:\n", " continue\n", " new.append(dict(answer=group['answer'], hint=group['hint'], clues=clues))\n", - " return new" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9", - "metadata": {}, - "outputs": [], - "source": [ + " return new\n", "corpus = [\n", " dict(name='suffixes', groups=process_groups(dsuffix), portion=.9),\n", " dict(name='prefixes', groups=process_groups(dprefix), portion=.8),\n", @@ -487,80 +170,61 @@ { "cell_type": "code", "execution_count": 65, - "id": "b35092c5-17db-4257-bf45-83e8c3973da4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "[Synset('basil_thyme.n.01')]" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "wn.synsets('satureja_acinos')" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "8025664c-e116-481a-9609-d58200f773ec", + "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1", "metadata": {}, "outputs": [], "source": [ - "# Can lemmatize/singularize and compare levenshtein based on that, but don't use lemma itself for the group as it may bed wrong" + "with open('../static/corpus.js', 'w') as f:\n", + " f.write('var corpus = ')\n", + " json.dump(corpus, f)" ] }, { "cell_type": "code", - "execution_count": 67, - "id": "77256802-04ef-4908-9b39-e9381f6abac5", + "execution_count": 66, + "id": "589f6645-3a52-40ad-9899-717ea7614d00", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(13824, 3976, 490, 7141, 226, 196)" + "'noun.group'" ] }, - "execution_count": 67, + "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)" + "wn.synsets('Islamic_jihad')[0].lexname()" ] }, { "cell_type": "code", - "execution_count": 68, - "id": "66256c50-fcbc-42ad-a17b-057fd0d7dea1", + "execution_count": 67, + "id": "8faeb5ee-e1ff-4571-91bf-178cdc7d29f7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "25853" + "'ABCDEFGHIJKLMNOPQRSTUVWXYZ'" ] }, - "execution_count": 68, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sum((len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)))" + " string.ascii_uppercase" ] }, { "cell_type": "code", "execution_count": null, - "id": "635b6a6c-c084-4584-a63f-cde4221e0ad9", + "id": "b8d12e1a-757f-4d87-9374-e7eb656f30d4", "metadata": {}, "outputs": [], "source": [] @@ -568,73 +232,31 @@ { "cell_type": "code", "execution_count": null, - "id": "f48e2eec-2ab1-4398-a862-172398c413a0", + "id": "68f254d5-7a96-488a-b1a8-6a82dce44271", "metadata": {}, "outputs": [], "source": [] }, - { - "cell_type": "code", - "execution_count": 70, - "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1", - "metadata": {}, - "outputs": [], - "source": [ - "with open('../static/corpus.js', 'w') as f:\n", - " f.write('var corpus = ')\n", - " json.dump(corpus, f)" - ] - }, { "cell_type": "code", "execution_count": null, - "id": "551ce71f-0d75-4e41-8387-808db1e5e20f", + "id": "091e0422-99be-4b20-a20d-17ac296990b6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 38, - "id": "19589357-f1ca-4d10-8574-3639bd05173f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "26388" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "dd927be9-a77c-4606-984a-b3cf555b2618", + "execution_count": null, + "id": "ff46e8c4-b806-43f6-8cf9-d4260bad1ba8", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "195" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, - "id": "f598354d-3f52-4952-a8c0-69c480ebe8b1", + "id": "668b25f4-6992-41a3-881c-3a6a72ba0d77", "metadata": {}, "outputs": [], "source": [] @@ -642,7 +264,7 @@ { "cell_type": "code", "execution_count": null, - "id": "77cb04f6-846e-454b-98e1-4feb575d2332", + "id": "1a0ae159-2a40-4ca5-915a-6ca40df3080f", "metadata": {}, "outputs": [], "source": [] -- cgit v1.2.3