From d3249bb81f01f607054b2fa8d34deb0d77402b31 Mon Sep 17 00:00:00 2001 From: cyfraeviolae Date: Thu, 30 Nov 2023 17:08:40 -0500 Subject: upd --- generators/datageneration.ipynb | 552 ++++++++++++++-------------------------- 1 file changed, 194 insertions(+), 358 deletions(-) (limited to 'generators') diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb index 457cc1d..ee65ead 100644 --- a/generators/datageneration.ipynb +++ b/generators/datageneration.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 59, + "execution_count": 1, "id": "e1b17564-0abb-41c5-8cf4-7200b014550f", "metadata": {}, "outputs": [ @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 2, "id": "8fe45bc7-a41a-49db-9067-700254f388c0", "metadata": {}, "outputs": [], @@ -41,53 +41,63 @@ }, { "cell_type": "code", - "execution_count": 229, + "execution_count": 3, "id": "c75240e0-8392-4a7b-9999-dc528b3d17a1", "metadata": {}, "outputs": [], "source": [ "from collections import defaultdict\n", - "dsynonyms = defaultdict(set)\n", - "n=0\n", "\n", - "for word in wn.words():\n", - " n+=1\n", - " synsets = wn.synsets(word)\n", - " synonymss = wn.synonyms(word)\n", - " syns = set()\n", - " for synset, synonyms in zip(synsets, synonymss):\n", - " if synset.pos() in ['a', 's']:\n", - " syns |= set(synonyms)\n", - " if len(syns) >= 4:\n", - " clues = [format(clue) for clue in syns]\n", - " ok = True\n", - " for clue in clues:\n", - " if clue in dsynonyms:\n", - " ok = False\n", - " if ok:\n", + "def groups_for_pos(pos='as'):\n", + " dsynonyms = []\n", + " for word in wn.words():\n", + " synsets = wn.synsets(word)\n", + " synonymss = wn.synonyms(word)\n", + " syns = set()\n", + " for synset, synonyms in zip(synsets, synonymss):\n", + " if synset.pos() in pos: # 'as'\n", + " syns |= set(synonyms)\n", + " if len(syns) >= 4:\n", + " clues = [format(clue) for clue in syns]\n", + " \n", " clues.append(format(word))\n", - " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)" + " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", + " \"\"\" \n", + " ok = True\n", + " for clue in clues:\n", + " if clue in dsynonyms:\n", + " ok = False\n", + " if ok:\n", + " clues.append(format(word))\n", + " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n", + " \"\"\"\n", + " return dsynonyms\n", + "\n", + "dadj = groups_for_pos('as')\n", + "dverb = groups_for_pos('v')\n", + "dnoun = groups_for_pos('n')\n", + "dadverb = groups_for_pos('r')" ] }, { "cell_type": "code", - "execution_count": 230, + "execution_count": 4, "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Synset('spanish_lime.n.01'), Synset('genip.n.02')]" + "(3976, 7141, 19563, 490)" ] }, - "execution_count": 230, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "wn.synsets('genip')" + "len(dadj), len(dverb), len(dnoun), len(dadverb)" ] }, { @@ -100,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 231, + "execution_count": 5, "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9", "metadata": {}, "outputs": [], @@ -126,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 232, + "execution_count": 6, "id": "e588bdf3-d648-48b3-ab6b-027a07194292", "metadata": {}, "outputs": [], @@ -136,7 +146,7 @@ }, { "cell_type": "code", - "execution_count": 243, + "execution_count": 7, "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2", 
"metadata": {}, "outputs": [], @@ -146,78 +156,146 @@ }, { "cell_type": "code", - "execution_count": 244, + "execution_count": 8, "id": "176e2790-560c-4daf-b436-a1771611c4bf", "metadata": {}, "outputs": [], "source": [ "df = df.drop(df[df.correctParse == 'no'].index)\n", - "df = df.drop(df[df.isCommonstim == 'no'].index)" + "df = df.drop(df[df.isCommonstim == 0].index)" ] }, { "cell_type": "code", - "execution_count": 235, + "execution_count": 9, "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235", "metadata": {}, "outputs": [], "source": [ "prefixes = df.groupby('c1').groups\n", "suffixes = df.groupby('c2').groups\n", - "pres = []\n", + "dprefix = []\n", "for prefix, ids in prefixes.items():\n", " if len(ids) >= 4:\n", - " pres.append((prefix, list(df.loc[list(ids)].c2)))\n", - "sufs = []\n", + " dprefix.append(dict(answer=prefix, hint=f'{prefix} _', clues=list(df.loc[list(ids)].c2)))\n", + "dsuffix = []\n", "for suffix, ids in suffixes.items():\n", " if len(ids) >= 4:\n", - " sufs.append((suffix, list(df.loc[list(ids)].c1)))" + " dsuffix.append(dict(answer=suffix, hint=f'_ {suffix}', clues=list(df.loc[list(ids)].c1)))" ] }, { "cell_type": "code", - "execution_count": 236, + "execution_count": null, "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586", "metadata": {}, "outputs": [], - "source": [ - "dprefix = {}\n", - "for prefix, ids in pres:\n", - " res = set()\n", - " for id in ids:\n", - " if (id[-1] == 's' and id[:-1] in ids) or (ids[-2:] == 'es' and ids[:-2] in ids):\n", - " continue\n", - " res.add(id)\n", - " if len(res) < 4:\n", - " continue\n", - " dprefix[prefix] = dict(group=f'{prefix} _', clues=list(res))\n", - "\n", - "dsuffix = {}\n", - "for suffix, ids in sufs:\n", - " if (suffix[-1] == 's' and suffix[:-1] in dsuffix) or (suffix[-2:] == 'es' and suffix[:-2] in ids):\n", - " #dsuffix[suffix[:-1]] = set(ids)\n", - " continue\n", - " if len(ids) < 4:\n", - " continue\n", - " dsuffix[suffix] = dict(group=f'_ {suffix}', clues=ids)" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 237, + "execution_count": 11, "id": "def43999-d789-4e5c-bb27-4fd29074c875", "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "IndentationError", + "evalue": "unindent does not match any outer indentation level (, line 84)", + "output_type": "error", + "traceback": [ + "\u001b[0;36m File \u001b[0;32m:84\u001b[0;36m\u001b[0m\n\u001b[0;31m r = ratio(a, b)\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unindent does not match any outer indentation level\n" + ] + } + ], "source": [ "from Levenshtein import ratio\n", "def similar(a, b):\n", - " return ratio(a, b) >= .8\n", - "import inflect\n", + " a, b = a.lower(), b.lower()\n", + " if len(a) > len(b):\n", + " a, b = b, a\n", + " if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b\n", + "Skip to Main\n", + "datageneration\n", + "Last Checkpoint: 11 days ago\n", + "[Python 3 (ipykernel)]\n", + "import json\n", + "import nltk\n", + "from nltk.corpus import wordnet as wn\n", + "from nltk.stem.wordnet import WordNetLemmatizer\n", + "nltk.download('wordnet')\n", + "from nltk.corpus import words\n", + "nltk.download('words')\n", + "ww = words.words()\n", "\n", - "p = inflect.engine()\n", + "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n", + "[nltk_data] Package wordnet is already up-to-date!\n", + "[nltk_data] Downloading package words to /home/sy/nltk_data...\n", + "[nltk_data] Package words is already up-to-date!\n", "\n", - "def normalize(w):\n", - " pass\n", + 
"def format(s):\n", + " return ' '.join(s.split('_'))\n", + "from collections import defaultdict\n", + "\n", + "def groups_for_pos(pos='as'):\n", + " dsynonyms = []\n", + " for word in wn.words():\n", + " synsets = wn.synsets(word)\n", + " synonymss = wn.synonyms(word)\n", + " syns = set()\n", + " for synset, synonyms in zip(synsets, synonymss):\n", + " if synset.pos() in pos: # 'as'\n", + " syns |= set(synonyms)\n", + " if len(syns) >= 4:\n", + " clues = [format(clue) for clue in syns]\n", + " \n", + " clues.append(format(word))\n", + " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", + " \"\"\" \n", + " ok = True\n", + " for clue in clues:\n", + " if clue in dsynonyms:\n", + " ok = False\n", + " if ok:\n", + " clues.append(format(word))\n", + " dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)\n", + " \"\"\"\n", + " return dsynonyms\n", + "\n", + "dadj = groups_for_pos('as')\n", + "dverb = groups_for_pos('v')\n", + "dnoun = groups_for_pos('n')\n", + "dadverb = groups_for_pos('r')\n", + "len(dadj), len(dverb), len(dnoun), len(dadverb)\n", + "\n", + "(3976, 7141, 19563, 490)\n", + "\n", + "\n", + "# flag button for reporting\n", + "\n", + "\n", + "import pandas as pd\n", + "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n", + "df = df.drop(df[df.correctParse == 'no'].index)\n", + "df = df.drop(df[df.isCommonstim == 0].index)\n", + "prefixes = df.groupby('c1').groups\n", + "suffixes = df.groupby('c2').groups\n", + "dprefix = []\n", + "for prefix, ids in prefixes.items():\n", + " if len(ids) >= 4:\n", + " dprefix.append(dict(answer=prefix, hint=f'{prefix} _', clues=list(df.loc[list(ids)].c2)))\n", + "dsuffix = []\n", + "for suffix, ids in suffixes.items():\n", + " if len(ids) >= 4:\n", + " dsuffix.append(dict(answer=suffix, hint=f'_ {suffix}', clues=list(df.loc[list(ids)].c1)))\n", + "\n", + ":\n", + " return True\n", + " # Then, print everything between .8 and .9 to see whats the best cutoff\n", + " r = ratio(a, b)\n", + " if .8 <= r <= .9:\n", + " pass\n", + " #print(a, b, r)\n", + " return r >= .85\n", "\n", "def filter_duplicates(group):\n", " if not group:\n", @@ -234,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 238, + "execution_count": null, "id": "6a3c04eb-79a6-47f5-846e-93258db65921", "metadata": {}, "outputs": [], @@ -244,58 +322,66 @@ }, { "cell_type": "code", - "execution_count": 239, + "execution_count": null, "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483", "metadata": {}, "outputs": [], "source": [ - "\n", - "def process_corpus(corpus):\n", - " new = {}\n", - " for word, group in corpus.items():\n", + "def process_groups(groups):\n", + " new = []\n", + " for group in groups:\n", " clues = group['clues']\n", " clues = [clue for clue in clues if clue not in blacklist]\n", " clues = filter_duplicates(clues)\n", " if len(clues) < 4:\n", " continue\n", - " new[word] = dict(group=group['group'], clues=clues)\n", + " new.append(dict(answer=group['answer'], hint=group['hint'], clues=clues))\n", " return new" ] }, { "cell_type": "code", - "execution_count": 240, - "id": "a59a4514-2572-4d35-a73d-fef58d1bc804", + "execution_count": null, + "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = dict(\n", + " adjectives=dadj,\n", + " nouns=dnoun,\n", + " adverbs=dadverb,\n", + " verbs=dverb,\n", + " prefixes=dprefix,\n", + " suffixes=dsuffix,\n", + ")\n", + "filtered_corpus = {}\n", + "for k, d in corpus.items():\n", + " filtered_corpus[k] = process_groups(d)" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b35092c5-17db-4257-bf45-83e8c3973da4", "metadata": {}, "outputs": [], "source": [ - "corpus = {**dprefix}\n", - "corpus.update(dsuffix)\n", - "corpus.update(dsynonyms)\n", - "filtered_corpus = process_corpus(corpus)" + "filtered_corpus['adverbs']" ] }, { "cell_type": "code", - "execution_count": 259, + "execution_count": null, "id": "8025664c-e116-481a-9609-d58200f773ec", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "437 330\n" - ] - } - ], + "outputs": [], "source": [ - "print(len(dprefix), len(dsuffix))" + "# Can lemmatize/singularize and compare levenshtein based on that, but don't use lemma itself for the group as it may bed wrong" ] }, { "cell_type": "code", - "execution_count": 241, + "execution_count": null, "id": "fccac4d7-af42-4445-8dd5-6f4b0d3aa9ca", "metadata": {}, "outputs": [], @@ -309,317 +395,67 @@ { "cell_type": "code", "execution_count": null, - "id": "4a82df07-568a-41f9-98c9-be0182522577", + "id": "551ce71f-0d75-4e41-8387-808db1e5e20f", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "similar('slow', 'slowl')" + ] }, { "cell_type": "code", - "execution_count": 242, - "id": "46157b29-1084-4caa-be4f-7c56be562da8", + "execution_count": 38, + "id": "19589357-f1ca-4d10-8574-3639bd05173f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[['encroach', 'impinge', 'infringe'],\n", - " ['encroach', 'entrench', 'impinge', 'trench'],\n", - " ['invasive', 'trespassing']]" + "26388" ] }, - "execution_count": 242, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "wn.synonyms('encroaching')" + "len(filtered_corpus)" ] }, { "cell_type": "code", - "execution_count": 252, - "id": "98e6a79f-4e7b-498d-a824-a44b52ae3829", + "execution_count": 39, + "id": "dd927be9-a77c-4606-984a-b3cf555b2618", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
c1c2stimisCommonC1isCommonC2isCommonstim
id_master
3237gadaboutgadabout111
4592knockaboutknockabout111
8231turnaboutturnabout111
6139raceaboutraceabout110
8331walkaboutwalkabout111
.....................
4515junkyardsjunkyards100
6812shipyardsshipyards100
2667farmyardsfarmyards100
1007brickyardsbrickyards100
8892zigzagzigzag001
\n", - "

8372 rows × 6 columns

\n", - "
" - ], "text/plain": [ - " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n", - "id_master \n", - "3237 gad about gadabout 1 1 1\n", - "4592 knock about knockabout 1 1 1\n", - "8231 turn about turnabout 1 1 1\n", - "6139 race about raceabout 1 1 0\n", - "8331 walk about walkabout 1 1 1\n", - "... ... ... ... ... ... ...\n", - "4515 junk yards junkyards 1 0 0\n", - "6812 ship yards shipyards 1 0 0\n", - "2667 farm yards farmyards 1 0 0\n", - "1007 brick yards brickyards 1 0 0\n", - "8892 zig zag zigzag 0 0 1\n", - "\n", - "[8372 rows x 6 columns]" + "195" ] }, - "execution_count": 252, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]" + "len(dsuffix)" ] }, { "cell_type": "code", - "execution_count": 258, - "id": "ebcdf335-02c3-480c-a241-f83f7569acb0", + "execution_count": null, + "id": "f598354d-3f52-4952-a8c0-69c480ebe8b1", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
c1c2stimisCommonC1isCommonC2isCommonstim
id_master
8361warfarewarfare111
2715fieldfarefieldfare110
1298carfarecarfare111
51airfareairfare111
\n", - "
" - ], - "text/plain": [ - " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n", - "id_master \n", - "8361 war fare warfare 1 1 1\n", - "2715 field fare fieldfare 1 1 0\n", - "1298 car fare carfare 1 1 1\n", - "51 air fare airfare 1 1 1" - ] - }, - "execution_count": 258, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.c2=='fare'][['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]" - ] + "outputs": [], + "source": [] }, { "cell_type": "code", "execution_count": null, - "id": "50989f8d-368e-4b4d-ab6c-355efce36c93", + "id": "77cb04f6-846e-454b-98e1-4feb575d2332", "metadata": {}, "outputs": [], "source": [] -- cgit v1.2.3