From 874d6321c955b2be037ac20d7f6b895d3d53be25 Mon Sep 17 00:00:00 2001 From: cyfraeviolae Date: Thu, 30 Nov 2023 18:35:07 -0500 Subject: ok --- generators/datageneration.ipynb | 317 ++++++++++++++++++++++++++++++++-------- 1 file changed, 252 insertions(+), 65 deletions(-) (limited to 'generators/datageneration.ipynb') diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb index ee65ead..85c6e02 100644 --- a/generators/datageneration.ipynb +++ b/generators/datageneration.ipynb @@ -41,13 +41,18 @@ }, { "cell_type": "code", - "execution_count": 3, - "id": "c75240e0-8392-4a7b-9999-dc528b3d17a1", + "execution_count": 60, + "id": "27488c82-1e9b-4873-9e6b-e0b5937fc51d", "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", - "\n", + "lexnames = set()\n", + "lexname_blacklist = {\n", + " 'noun.plant',\n", + " 'noun.animal',\n", + " 'noun.person'\n", + "}\n", "def groups_for_pos(pos='as'):\n", " dsynonyms = []\n", " for word in wn.words():\n", @@ -55,11 +60,12 @@ " synonymss = wn.synonyms(word)\n", " syns = set()\n", " for synset, synonyms in zip(synsets, synonymss):\n", - " if synset.pos() in pos: # 'as'\n", + " if synset.lexname() in lexname_blacklist:\n", + " continue\n", + " if synset.pos() in pos:\n", " syns |= set(synonyms)\n", " if len(syns) >= 4:\n", " clues = [format(clue) for clue in syns]\n", - " \n", " clues.append(format(word))\n", " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", " \"\"\" \n", @@ -71,8 +77,16 @@ " clues.append(format(word))\n", " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n", " \"\"\"\n", - " return dsynonyms\n", - "\n", + " return dsynonyms" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "445789e7-3808-4a28-9df5-9a69313cb4c2", + "metadata": {}, + "outputs": [], + "source": [ "dadj = groups_for_pos('as')\n", "dverb = groups_for_pos('v')\n", "dnoun = groups_for_pos('n')\n", @@ -81,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 30, "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc", "metadata": {}, "outputs": [ @@ -91,7 +105,7 @@ "(3976, 7141, 19563, 490)" ] }, - "execution_count": 4, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -102,20 +116,33 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 62, "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "(3976, 7141, 13824, 490)" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dadj), len(dverb), len(dnoun), len(dadverb)" + ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 33, "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9", "metadata": {}, "outputs": [], "source": [ - "# flag button for reporting" + "# flag button for reporting, definitions in webcollectionscollections" ] }, { @@ -129,7 +156,115 @@ { "cell_type": "code", "execution_count": null, - "id": "ecc5527c-a0b0-4e48-ae0a-4cb3a1a8a12b", + "id": "f4eb0f95-901d-43d8-8ba7-6103b3a0f6be", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "bdcb3a19-a5cf-48e9-ab6f-aaeed50e2c31", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'adj.all',\n", + " 'adj.pert',\n", + " 'adj.ppl',\n", + " 'adv.all',\n", + " 'noun.Tops',\n", + " 'noun.act',\n", + " 'noun.animal',\n", + " 'noun.artifact',\n", + " 'noun.attribute',\n", + " 'noun.body',\n", + " 'noun.cognition',\n", + " 'noun.communication',\n", + " 'noun.event',\n", + " 'noun.feeling',\n", + " 'noun.food',\n", + " 'noun.group',\n", + " 'noun.location',\n", + " 'noun.motive',\n", + " 'noun.object',\n", + " 'noun.person',\n", + " 'noun.phenomenon',\n", + " 'noun.plant',\n", + " 'noun.possession',\n", + " 'noun.process',\n", + " 'noun.quantity',\n", + " 'noun.relation',\n", + " 'noun.shape',\n", + " 'noun.state',\n", + " 'noun.substance',\n", + " 'noun.time',\n", + " 'verb.body',\n", + " 'verb.change',\n", + " 'verb.cognition',\n", + " 'verb.communication',\n", + " 'verb.competition',\n", + " 'verb.consumption',\n", + " 'verb.contact',\n", + " 'verb.creation',\n", + " 'verb.emotion',\n", + " 'verb.motion',\n", + " 'verb.perception',\n", + " 'verb.possession',\n", + " 'verb.social',\n", + " 'verb.stative',\n", + " 'verb.weather'}" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "d3a0f1e9-99e5-4aa2-bd72-d1d200bf2b40", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'noun.plant'" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "syn.lexname()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "64ad6047-a09b-41c3-85b6-65476d8dba0e", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88a4aeab-8123-435e-9b44-3e568102f0b1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6459fdf-fe4e-4dcf-81ef-1688de01be95", "metadata": {}, "outputs": [], "source": [] @@ -194,26 +329,24 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "def43999-d789-4e5c-bb27-4fd29074c875", "metadata": {}, - "outputs": [ - { - "ename": "IndentationError", - "evalue": "unindent does not match any outer indentation level (, line 84)", - "output_type": "error", - "traceback": [ - "\u001b[0;36m File \u001b[0;32m:84\u001b[0;36m\u001b[0m\n\u001b[0;31m r = ratio(a, b)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n" - ] - } - ], + "outputs": [], "source": [ "from Levenshtein import ratio\n", "def similar(a, b):\n", " a, b = a.lower(), b.lower()\n", " if len(a) > len(b):\n", " a, b = b, a\n", - " if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b\n", + " if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b:\n", + " return True\n", + " r = ratio(a, b)\n", + " if .8 <= r <= .9:\n", + " pass\n", + " #print(a, b, r)\n", + " return r >= .85\n", + "\"\"\"\n", "Skip to Main\n", "datageneration\n", "Last Checkpoint: 11 days ago\n", @@ -250,7 +383,7 @@ " \n", " clues.append(format(word))\n", " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", - " \"\"\" \n", + " \n", " ok = True\n", " for clue in clues:\n", " if clue in dsynonyms:\n", @@ -258,7 +391,7 @@ " if ok:\n", " clues.append(format(word))\n", " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n", - " \"\"\"\n", + " \n", " return dsynonyms\n", "\n", "dadj = groups_for_pos('as')\n", @@ -291,12 +424,7 @@ ":\n", " return True\n", " # Then, print everything between .8 and .9 to see whats the best cutoff\n", - " r = ratio(a, b)\n", - " if .8 <= r <= .9:\n", - " pass\n", - " #print(a, b, r)\n", - " return r >= .85\n", - "\n", + "\"\"\"\n", "def filter_duplicates(group):\n", " if not group:\n", " return []\n", @@ -312,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "6a3c04eb-79a6-47f5-846e-93258db65921", "metadata": {}, "outputs": [], @@ -322,7 +450,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 63, "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483", "metadata": {}, "outputs": [], @@ -341,37 +469,45 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 64, "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9", "metadata": {}, "outputs": [], "source": [ - "corpus = dict(\n", - " adjectives=dadj,\n", - " nouns=dnoun,\n", - " adverbs=dadverb,\n", - " verbs=dverb,\n", - " prefixes=dprefix,\n", - " suffixes=dsuffix,\n", - ")\n", - "filtered_corpus = {}\n", - "for k, d in corpus.items():\n", - " filtered_corpus[k] = process_groups(d)" + "corpus = [\n", + " dict(name='suffixes', groups=process_groups(dsuffix), portion=.9),\n", + " dict(name='prefixes', groups=process_groups(dprefix), portion=.8),\n", + " dict(name='verbs', groups=process_groups(dverb), portion=.6),\n", + " dict(name='adverbs', groups=process_groups(dadverb), portion=.54),\n", + " dict(name='nouns', groups=process_groups(dnoun), portion=.2),\n", + " dict(name='adjectives', groups=process_groups(dadj), portion=0),\n", + "]" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 65, "id": "b35092c5-17db-4257-bf45-83e8c3973da4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "[Synset('basil_thyme.n.01')]" + ] + }, + "execution_count": 65, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "filtered_corpus['adverbs']" + "wn.synsets('satureja_acinos')" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 66, "id": "8025664c-e116-481a-9609-d58200f773ec", "metadata": {}, "outputs": [], @@ -379,17 +515,74 @@ "# Can lemmatize/singularize and compare levenshtein based on that, but don't use lemma itself for the group as it may bed wrong" ] }, + { + "cell_type": "code", + "execution_count": 67, + "id": "77256802-04ef-4908-9b39-e9381f6abac5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(13824, 3976, 490, 7141, 226, 196)" + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "id": "66256c50-fcbc-42ad-a17b-057fd0d7dea1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "25853" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sum((len(dnoun), len(dadj), len(dadverb), len(dverb), len(dprefix), len(dsuffix)))" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "fccac4d7-af42-4445-8dd5-6f4b0d3aa9ca", + "id": "635b6a6c-c084-4584-a63f-cde4221e0ad9", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f48e2eec-2ab1-4398-a862-172398c413a0", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1", "metadata": {}, "outputs": [], "source": [ - "\n", "with open('../static/corpus.js', 'w') as f:\n", " f.write('var corpus = ')\n", - " json.dump(filtered_corpus, f)" + " json.dump(corpus, f)" ] }, { @@ -398,9 +591,7 @@ "id": "551ce71f-0d75-4e41-8387-808db1e5e20f", "metadata": {}, "outputs": [], - "source": [ - "similar('slow', 'slowl')" - ] + "source": [] }, { "cell_type": "code", @@ -419,9 +610,7 @@ "output_type": "execute_result" } ], - "source": [ - "len(filtered_corpus)" - ] + "source": [] }, { "cell_type": "code", @@ -440,9 +629,7 @@ "output_type": "execute_result" } ], - "source": [ - "len(dsuffix)" - ] + "source": [] }, { "cell_type": "code", -- cgit v1.2.3