{ "cells": [ { "cell_type": "code", "execution_count": 38, "id": "e1b17564-0abb-41c5-8cf4-7200b014550f", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n", "[nltk_data] Package wordnet is already up-to-date!\n", "[nltk_data] Downloading package words to /home/sy/nltk_data...\n", "[nltk_data] Package words is already up-to-date!\n" ] } ], "source": [ "import json\n", "import nltk\n", "from nltk.corpus import wordnet as wn\n", "from nltk.stem.wordnet import WordNetLemmatizer\n", "nltk.download('wordnet')\n", "from nltk.corpus import words\n", "nltk.download('words')\n", "ww = words.words()\n", "import pandas as pd \n", "import string" ] }, { "cell_type": "code", "execution_count": 60, "id": "27488c82-1e9b-4873-9e6b-e0b5937fc51d", "metadata": {}, "outputs": [], "source": [ "def format(s):\n", " return ' '.join(s.split('_'))\n", "from collections import defaultdict\n", "lexnames = set()\n", "lexname_blacklist = {\n", " 'noun.plant',\n", " 'noun.animal',\n", " 'noun.person',\n", " 'noun.state',\n", " 'noun.body',\n", " #'noun.location',\n", " #'noun.group',\n", "}\n", "def is_proper(w):\n", " return w[0] in string.ascii_uppercase\n", "def groups_for_pos(pos='as'):\n", " dsynonyms = []\n", " for word in wn.words():\n", " if is_proper(word):\n", " continue\n", " synsets = wn.synsets(word)\n", " synonymss = wn.synonyms(word)\n", " syns = set()\n", " for synset, synonyms in zip(synsets, synonymss):\n", " if synset.lexname() in lexname_blacklist:\n", " continue\n", " if synset.pos() in pos:\n", " syns |= set(syn for syn in synonyms if not is_proper(syn))\n", " if len(syns) >= 4:\n", " clues = [format(clue) for clue in syns]\n", " clues.append(format(word))\n", " dsynonyms.append(dict(answer=word, hint=f'synonyms for {format(word)}', clues=clues))\n", " return dsynonyms" ] }, { "cell_type": "code", "execution_count": 61, "id": "445789e7-3808-4a28-9df5-9a69313cb4c2", "metadata": {}, "outputs": [], "source": [ "dadj = groups_for_pos('as')\n", "dverb = groups_for_pos('v')\n", "dnoun = groups_for_pos('n')\n", "dadverb = groups_for_pos('r')" ] }, { "cell_type": "code", "execution_count": 62, "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235", "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('LADECv1-2019.csv', index_col=0)\n", "df = df.drop(df[df.correctParse == 'no'].index)\n", "df = df.drop(df[df.isCommonstim == 0].index)\n", "prefixes = df.groupby('c1').groups\n", "suffixes = df.groupby('c2').groups\n", "dprefix = []\n", "for prefix, ids in prefixes.items():\n", " if len(ids) >= 4:\n", " dprefix.append(dict(answer=prefix, hint=f'{prefix} _', clues=list(df.loc[list(ids)].c2)))\n", "dsuffix = []\n", "for suffix, ids in suffixes.items():\n", " if len(ids) >= 4:\n", " dsuffix.append(dict(answer=suffix, hint=f'_ {suffix}', clues=list(df.loc[list(ids)].c1)))" ] }, { "cell_type": "code", "execution_count": 63, "id": "def43999-d789-4e5c-bb27-4fd29074c875", "metadata": {}, "outputs": [], "source": [ "from Levenshtein import ratio\n", "def similar(a, b):\n", " a, b = a.lower(), b.lower()\n", " if len(a) > len(b):\n", " a, b = b, a\n", " if a + 's' == b or a + 'es' == b or a + 'ing' == b or a + 'ly' == b:\n", " return True\n", " r = ratio(a, b)\n", " if .8 <= r <= .9:\n", " pass\n", " #print(a, b, r)\n", " return r >= .85\n", "def filter_duplicates(group):\n", " if not group:\n", " return []\n", " ok = [group[0]]\n", " for i in range(1, len(group)):\n", " for word in ok:\n", " if similar(word, group[i]):\n", " break\n", " else:\n", " ok.append(group[i])\n", " return ok" ] }, { "cell_type": "code", "execution_count": 64, "id": "1c8175f2-817e-45ab-af0b-5ea7ee7a5dc9", "metadata": {}, "outputs": [], "source": [ "blacklist = ['man', 'men', 'woman', 'women']\n", "def process_groups(groups):\n", " new = []\n", " for group in groups:\n", " clues = group['clues']\n", " clues = [clue for clue in clues if clue not in blacklist]\n", " clues = filter_duplicates(clues)\n", " if len(clues) < 4:\n", " continue\n", " new.append(dict(answer=group['answer'], hint=group['hint'], clues=clues))\n", " return new\n", "corpus = [\n", " dict(name='suffixes', groups=process_groups(dsuffix), portion=.9),\n", " dict(name='prefixes', groups=process_groups(dprefix), portion=.8),\n", " dict(name='verbs', groups=process_groups(dverb), portion=.6),\n", " dict(name='adverbs', groups=process_groups(dadverb), portion=.54),\n", " dict(name='nouns', groups=process_groups(dnoun), portion=.2),\n", " dict(name='adjectives', groups=process_groups(dadj), portion=0),\n", "]" ] }, { "cell_type": "code", "execution_count": 65, "id": "1f2cca38-5d1a-421e-9bcc-09d68880c6f1", "metadata": {}, "outputs": [], "source": [ "with open('../static/corpus.js', 'w') as f:\n", " f.write('var corpus = ')\n", " json.dump(corpus, f)" ] }, { "cell_type": "code", "execution_count": 66, "id": "589f6645-3a52-40ad-9899-717ea7614d00", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'noun.group'" ] }, "execution_count": 66, "metadata": {}, "output_type": "execute_result" } ], "source": [ "wn.synsets('Islamic_jihad')[0].lexname()" ] }, { "cell_type": "code", "execution_count": 67, "id": "8faeb5ee-e1ff-4571-91bf-178cdc7d29f7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ABCDEFGHIJKLMNOPQRSTUVWXYZ'" ] }, "execution_count": 67, "metadata": {}, "output_type": "execute_result" } ], "source": [ " string.ascii_uppercase" ] }, { "cell_type": "code", "execution_count": null, "id": "b8d12e1a-757f-4d87-9374-e7eb656f30d4", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "68f254d5-7a96-488a-b1a8-6a82dce44271", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "091e0422-99be-4b20-a20d-17ac296990b6", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "ff46e8c4-b806-43f6-8cf9-d4260bad1ba8", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "668b25f4-6992-41a3-881c-3a6a72ba0d77", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "1a0ae159-2a40-4ca5-915a-6ca40df3080f", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.5" } }, "nbformat": 4, "nbformat_minor": 5 }