summaryrefslogtreecommitdiff
path: root/generators/datageneration.ipynb
diff options
context:
space:
mode:
Diffstat (limited to 'generators/datageneration.ipynb')
-rw-r--r--generators/datageneration.ipynb367
1 files changed, 367 insertions, 0 deletions
diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb
new file mode 100644
index 0000000..beeb901
--- /dev/null
+++ b/generators/datageneration.ipynb
@@ -0,0 +1,367 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "id": "e1b17564-0abb-41c5-8cf4-7200b014550f",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package wordnet to /home/sy/nltk_data...\n",
+ "[nltk_data] Package wordnet is already up-to-date!\n",
+ "[nltk_data] Downloading package words to /home/sy/nltk_data...\n",
+ "[nltk_data] Package words is already up-to-date!\n"
+ ]
+ }
+ ],
+ "source": [
+ "import json\n",
+ "import nltk\n",
+ "from nltk.corpus import wordnet as wn\n",
+ "from nltk.stem.wordnet import WordNetLemmatizer\n",
+ "nltk.download('wordnet')\n",
+ "from nltk.corpus import words\n",
+ "nltk.download('words')\n",
+ "ww = words.words()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 154,
+ "id": "8fe45bc7-a41a-49db-9067-700254f388c0",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def format(s):\n",
+ "    \"\"\"Return `s` with every underscore replaced by a single space.\n",
+ "\n",
+ "    Example: 'spanish_lime' -> 'spanish lime'.\n",
+ "    Note: this name shadows the builtin `format` for the rest of the notebook.\n",
+ "    \"\"\"\n",
+ "    return s.replace('_', ' ')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 229,
+ "id": "c75240e0-8392-4a7b-9999-dc528b3d17a1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from collections import defaultdict\n",
+ "# Build one clue group per WordNet word that has at least four adjective synonyms.\n",
+ "# A plain dict is used on purpose: every value is a dict(group=..., clues=...), and a\n",
+ "# defaultdict(set) would insert non-JSON-serializable sets on any accidental lookup.\n",
+ "dsynonyms = {}\n",
+ "\n",
+ "for word in wn.words():\n",
+ "    # wn.synonyms(word) is zipped with wn.synsets(word) on the assumption that it\n",
+ "    # returns one synonym list per synset, in the same order.\n",
+ "    syns = set()\n",
+ "    for synset, synonyms in zip(wn.synsets(word), wn.synonyms(word)):\n",
+ "        # 'a' / 's' are WordNet's adjective and adjective-satellite POS tags.\n",
+ "        if synset.pos() in ('a', 's'):\n",
+ "            syns.update(synonyms)\n",
+ "    if len(syns) < 4:\n",
+ "        continue\n",
+ "    # Skip this word when one of its synonyms already owns a group.  The check uses\n",
+ "    # the raw lemma names because dsynonyms is keyed by raw words (with underscores);\n",
+ "    # comparing the space-formatted clues could never match a multi-word key.\n",
+ "    if any(syn in dsynonyms for syn in syns):\n",
+ "        continue\n",
+ "    # Sort so the generated corpus does not depend on set iteration order.\n",
+ "    clues = [format(syn) for syn in sorted(syns)]\n",
+ "    clues.append(format(word))\n",
+ "    dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 230,
+ "id": "7e552fc8-03b2-4b8f-b6f6-072d580702bc",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Synset('spanish_lime.n.01'), Synset('genip.n.02')]"
+ ]
+ },
+ "execution_count": 230,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wn.synsets('genip')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8d780210-38a4-4f8d-ae2e-b4631cb06368",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 231,
+ "id": "48233554-2634-4a5e-9013-4e45c6f7d3d9",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# flag button for reporting"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "1464f8df-180a-4334-b123-d76303140a03",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ecc5527c-a0b0-4e48-ae0a-4cb3a1a8a12b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 232,
+ "id": "e588bdf3-d648-48b3-ab6b-027a07194292",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 233,
+ "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the LADEC CSV (path is relative to the working directory); the first column becomes the index.\n",
+ "df = pd.read_csv('LADECv1-2019.csv', index_col=0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 234,
+ "id": "176e2790-560c-4daf-b436-a1771611c4bf",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Drop every row whose correctParse column equals 'no'. Rebinds df, so re-running is a no-op.\n",
+ "df = df.drop(df[df.correctParse == 'no'].index)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 235,
+ "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Row labels of df grouped by the value of column c1 / c2\n",
+ "# (presumably the first / second constituent of each compound -- confirm against the LADEC docs).\n",
+ "prefixes = df.groupby('c1').groups\n",
+ "suffixes = df.groupby('c2').groups\n",
+ "\n",
+ "# (c1 value, [c2 of every row sharing that c1]) for c1 values with at least four rows.\n",
+ "pres = [\n",
+ "    (first, list(df.loc[list(rows)].c2))\n",
+ "    for first, rows in prefixes.items()\n",
+ "    if len(rows) >= 4\n",
+ "]\n",
+ "\n",
+ "# (c2 value, [c1 of every row sharing that c2]) for c2 values with at least four rows.\n",
+ "sufs = [\n",
+ "    (second, list(df.loc[list(rows)].c1))\n",
+ "    for second, rows in suffixes.items()\n",
+ "    if len(rows) >= 4\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 236,
+ "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def _is_plural_of_known(word, known):\n",
+ "    \"\"\"Return True when `word` is an -s or -es plural of an entry in `known`.\n",
+ "\n",
+ "    `known` is any container supporting `in` (set, dict, list).\n",
+ "    \"\"\"\n",
+ "    return (word.endswith('s') and word[:-1] in known) or (\n",
+ "        word.endswith('es') and word[:-2] in known\n",
+ "    )\n",
+ "\n",
+ "\n",
+ "# Groups of the form '<prefix> _': the clues are the words paired with a prefix.\n",
+ "dprefix = {}\n",
+ "for prefix, partners in pres:\n",
+ "    known = set(partners)\n",
+ "    # Drop a plural when its singular is in the same group.  The original -es test\n",
+ "    # sliced the whole list instead of the word, so it never fired.\n",
+ "    singulars = {w for w in partners if not _is_plural_of_known(w, known)}\n",
+ "    if len(singulars) < 4:\n",
+ "        continue\n",
+ "    # Sorted so the generated corpus does not depend on set iteration order.\n",
+ "    dprefix[prefix] = dict(group=f'{prefix} _', clues=sorted(singulars))\n",
+ "\n",
+ "# Groups of the form '_ <suffix>': the clues are the words paired with a suffix.\n",
+ "dsuffix = {}\n",
+ "for suffix, partners in sufs:\n",
+ "    # Skip a plural suffix whose singular already has a group.  This only works if\n",
+ "    # the singular appears earlier in `sufs` -- presumably true because groupby\n",
+ "    # sorts its keys; confirm.  The original -es test looked in the clue list\n",
+ "    # instead of dsuffix.\n",
+ "    if _is_plural_of_known(suffix, dsuffix):\n",
+ "        continue\n",
+ "    if len(partners) < 4:\n",
+ "        continue\n",
+ "    dsuffix[suffix] = dict(group=f'_ {suffix}', clues=partners)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 237,
+ "id": "def43999-d789-4e5c-bb27-4fd29074c875",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from Levenshtein import ratio\n",
+ "def similar(a, b, threshold=.8):\n",
+ "    \"\"\"Return True when `ratio(a, b)` is at least `threshold`.\n",
+ "\n",
+ "    `ratio` is the function imported from the Levenshtein package; the default\n",
+ "    threshold of 0.8 is the cut-off this notebook has always used.\n",
+ "    \"\"\"\n",
+ "    return ratio(a, b) >= threshold\n",
+ "import inflect\n",
+ "\n",
+ "p = inflect.engine()\n",
+ "\n",
+ "def normalize(w):\n",
+ " \"\"\"Unimplemented placeholder: ignores `w` and returns None.\"\"\"\n",
+ " pass\n",
+ "\n",
+ "def filter_duplicates(group):\n",
+ "    \"\"\"Return the items of `group`, in order, skipping near-duplicates.\n",
+ "\n",
+ "    An item is kept only if `similar` is False against every item kept so far,\n",
+ "    so the first of several similar words wins.  An empty `group` gives [].\n",
+ "    \"\"\"\n",
+ "    kept = []\n",
+ "    for candidate in group:\n",
+ "        if not any(similar(existing, candidate) for existing in kept):\n",
+ "            kept.append(candidate)\n",
+ "    return kept"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 238,
+ "id": "6a3c04eb-79a6-47f5-846e-93258db65921",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Clue words that process_corpus removes from every group.\n",
+ "blacklist = ['man', 'men', 'woman', 'women']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 239,
+ "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def process_corpus(corpus):\n",
+ "    \"\"\"Return a cleaned copy of `corpus`, leaving the input untouched.\n",
+ "\n",
+ "    For each entry, clues found in `blacklist` are removed and the rest is passed\n",
+ "    through `filter_duplicates`.  Entries left with fewer than four clues are\n",
+ "    omitted; the others keep their key and `group` label.\n",
+ "    \"\"\"\n",
+ "    cleaned = {}\n",
+ "    for key, entry in corpus.items():\n",
+ "        allowed = [clue for clue in entry['clues'] if clue not in blacklist]\n",
+ "        distinct = filter_duplicates(allowed)\n",
+ "        if len(distinct) >= 4:\n",
+ "            cleaned[key] = {'group': entry['group'], 'clues': distinct}\n",
+ "    return cleaned"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 240,
+ "id": "a59a4514-2572-4d35-a73d-fef58d1bc804",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Merge the three sources.  All of them are keyed by a bare word, so the same key\n",
+ "# can occur in more than one (presumably e.g. a word that is both a compound\n",
+ "# prefix and a compound suffix).  As before, the later source keeps the plain key;\n",
+ "# the group it would have overwritten is now kept under its own `group` label\n",
+ "# instead of being silently dropped.\n",
+ "corpus = {}\n",
+ "for source in (dprefix, dsuffix, dsynonyms):\n",
+ "    for key, entry in source.items():\n",
+ "        if key in corpus:\n",
+ "            displaced = corpus[key]\n",
+ "            corpus[displaced['group']] = displaced\n",
+ "        corpus[key] = entry\n",
+ "filtered_corpus = process_corpus(corpus)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "8025664c-e116-481a-9609-d58200f773ec",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 241,
+ "id": "fccac4d7-af42-4445-8dd5-6f4b0d3aa9ca",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "# Write the filtered corpus as a JavaScript file: 'var corpus = ' followed by the JSON text.\n",
+ "with open('../static/corpus.js', 'w') as out:\n",
+ "    out.write('var corpus = ' + json.dumps(filtered_corpus))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "4a82df07-568a-41f9-98c9-be0182522577",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 242,
+ "id": "46157b29-1084-4caa-be4f-7c56be562da8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[['encroach', 'impinge', 'infringe'],\n",
+ " ['encroach', 'entrench', 'impinge', 'trench'],\n",
+ " ['invasive', 'trespassing']]"
+ ]
+ },
+ "execution_count": 242,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "wn.synonyms('encroaching')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "98e6a79f-4e7b-498d-a824-a44b52ae3829",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "ebcdf335-02c3-480c-a241-f83f7569acb0",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.5"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}