# --- Setup: WordNet + word list -------------------------------------------
import json

import nltk
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('wordnet')
from nltk.corpus import words
nltk.download('words')
ww = words.words()


def format(s):
    """Replace WordNet's underscore separators in a lemma with spaces.

    NOTE(review): shadows the built-in ``format``; name kept because the
    rest of the notebook calls it.
    """
    return ' '.join(s.split('_'))


from collections import defaultdict

def groups_for_pos(pos='as'):
    """Build synonym groups for every WordNet lemma matching `pos`.

    Parameters
    ----------
    pos : str
        WordNet POS tags accepted for a synset, tested per character
        (e.g. 'as' = adjective + satellite adjective, 'v', 'n', 'r').

    Returns
    -------
    list[dict]
        One dict per word that collected at least 4 synonyms, with keys
        ``answer`` (the word), ``hint`` and ``clues`` (the synonyms plus
        the word itself, underscores replaced by spaces).
    """
    dsynonyms = []
    for word in wn.words():
        # wn.synonyms(word) yields one lemma list per synset, aligned with
        # wn.synsets(word) — zipping pairs each synset with its lemmas.
        synsets = wn.synsets(word)
        synonymss = wn.synonyms(word)
        syns = set()
        for synset, synonyms in zip(synsets, synonymss):
            if synset.pos() in pos:
                syns |= set(synonyms)
        if len(syns) >= 4:
            clues = [format(clue) for clue in syns]
            clues.append(format(word))
            dsynonyms.append(dict(answer=word,
                                  hint=f'synonyms for {format(word)}',
                                  clues=clues))
    return dsynonyms


dadj = groups_for_pos('as')
dverb = groups_for_pos('v')
dnoun = groups_for_pos('n')
dadverb = groups_for_pos('r')
len(dadj), len(dverb), len(dnoun), len(dadverb)

# flag button for reporting

import pandas as pd

# LADEC compound database; first column is the row id.
df = pd.read_csv('LADECv1-2019.csv', index_col=0)

# Keep only correctly parsed, common compound stimuli.
df = df.drop(df[df.correctParse == 'no'].index)
df = df.drop(df[df.isCommonstim == 0].index)

# Row ids grouped by first (c1) and second (c2) compound constituent.
prefixes = df.groupby('c1').groups
suffixes = df.groupby('c2').groups


def _component_groups(groups, frame, clue_col, hint_fmt):
    """One clue group per compound constituent occurring in >= 4 compounds.

    Parameters
    ----------
    groups : dict
        Mapping constituent -> row ids (output of ``groupby(...).groups``).
    frame : pd.DataFrame
        The compound table the row ids index into.
    clue_col : str
        Column holding the *other* constituent, used as the clues.
    hint_fmt : str
        ``str.format`` template for the hint, e.g. ``'{} _'``.
    """
    out = []
    for component, ids in groups.items():
        if len(ids) >= 4:
            out.append(dict(answer=component,
                            hint=hint_fmt.format(component),
                            clues=list(frame.loc[list(ids)][clue_col])))
    return out


# "prefix" groups: shared first constituent, clues are the second halves;
# "suffix" groups: shared second constituent, clues are the first halves.
dprefix = _component_groups(prefixes, df, 'c2', '{} _')
dsuffix = _component_groups(suffixes, df, 'c1', '_ {}')
from Levenshtein import ratio


def similar(a, b):
    """Case-insensitive near-duplicate check between two clue words.

    Simple inflections of the shorter word ('s', 'es', 'ing', 'ly') count
    as the same word; otherwise a Levenshtein similarity cutoff decides.

    Returns
    -------
    bool
        True if `a` and `b` should be treated as duplicates.
    """
    a, b = a.lower(), b.lower()
    if len(a) > len(b):
        a, b = b, a  # make `a` the shorter string
    if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b:
        return True
    # Tuning aid: print everything between .8 and .9 to pick the best cutoff.
    r = ratio(a, b)
    if .8 <= r <= .9:
        pass
        # print(a, b, r)
    return r >= .85


def filter_duplicates(group):
    """Return `group` with clues that near-duplicate an earlier clue removed."""
    if not group:
        return []
    ok = [group[0]]
    for i in range(1, len(group)):
        for word in ok:
            if similar(word, group[i]):
                break
        else:
            # no kept clue was similar -> keep this one too
            ok.append(group[i])
    return ok


# Clue words too generic to be useful.
blacklist = ['man', 'men', 'woman', 'women']
def process_groups(groups):
    """Clean each group's clues and drop groups that become too small.

    Removes blacklisted clues, collapses near-duplicates via
    ``filter_duplicates``, and keeps only groups with at least 4 clues left.
    """
    cleaned = []
    for group in groups:
        clues = [clue for clue in group['clues'] if clue not in blacklist]
        clues = filter_duplicates(clues)
        if len(clues) >= 4:
            cleaned.append(dict(answer=group['answer'],
                                hint=group['hint'],
                                clues=clues))
    return cleaned


corpus = dict(
    adjectives=dadj,
    nouns=dnoun,
    adverbs=dadverb,
    verbs=dverb,
    prefixes=dprefix,
    suffixes=dsuffix,
)
filtered_corpus = {}
for k, d in corpus.items():
    filtered_corpus[k] = process_groups(d)

filtered_corpus['adverbs']

# Could lemmatize/singularize and compare Levenshtein on that basis, but
# don't use the lemma itself for the group as it may be wrong.

# Export as a JS global so the static front-end can load it directly.
with open('../static/corpus.js', 'w') as f:
    f.write('var corpus = ')
    json.dump(filtered_corpus, f)

similar('slow', 'slowl')
# NOTE(review): the recorded output (26388) cannot come from the code above,
# where filtered_corpus is a 6-key dict — stale kernel state; re-run to refresh.
len(filtered_corpus)

len(dsuffix)