# Data generation, part 1: WordNet adjective synonym groups and the LADEC
# compound table used by the later sections.
import json
from collections import defaultdict

import nltk
import pandas as pd
from nltk.corpus import wordnet as wn
from nltk.corpus import words
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('words')
ww = words.words()


def format(s):
    """Return *s* with every underscore replaced by a space.

    WordNet lemma names join multi-word expressions with ``_``; clues are
    emitted with plain spaces instead.

    NOTE(review): this name shadows the ``format`` builtin. It is kept
    unchanged so existing references to it keep working.
    """
    return s.replace('_', ' ')


# Maps a WordNet lemma to ``{'group': <label>, 'clues': [<clue>, ...]}``.
# The ``set`` default factory is never triggered in this section: entries are
# only created by explicit assignment, and ``in`` tests do not create keys.
dsynonyms = defaultdict(set)

for word in wn.words():
    # wn.synonyms(word) yields one list of synonyms per synset of *word*, so
    # it can be walked in lockstep with wn.synsets(word).
    syns = set()
    for synset, synonyms in zip(wn.synsets(word), wn.synonyms(word)):
        # Only adjective ('a') and adjective-satellite ('s') senses count.
        if synset.pos() in ('a', 's'):
            syns.update(synonyms)

    # A usable group needs at least four synonyms besides the word itself.
    if len(syns) < 4:
        continue

    # Skip the word when one of its synonyms already heads a group, to avoid
    # near-identical groups. The comparison uses the raw lemma names because
    # the keys of dsynonyms are raw (underscore-joined) lemmas; comparing the
    # space-formatted clues would never match a multi-word lemma.
    if any(syn in dsynonyms for syn in syns):
        continue

    # Sort so the clue order is the same on every run; later de-duplication
    # keeps the first of two similar clues and is therefore order-sensitive.
    clues = [format(syn) for syn in sorted(syns)]
    clues.append(format(word))
    dsynonyms[word] = dict(group=f'synonyms for {format(word)}', clues=clues)

# Exploratory lookup kept from the notebook; its result is not used.
wn.synsets('genip')

# TODO: flag button for reporting

# LADEC compound table; the first CSV column becomes the index.
df = pd.read_csv('LADECv1-2019.csv', index_col=0)
# Discard the rows whose correctParse column is 'no'.
df = df.drop(df[df.correctParse == 'no'].index)
"cell_type": "code", + "execution_count": 235, + "id": "64ccaf3d-9743-49ed-b10b-d7b3e70e0235", + "metadata": {}, + "outputs": [], + "source": [ + "prefixes = df.groupby('c1').groups\n", + "suffixes = df.groupby('c2').groups\n", + "pres = []\n", + "for prefix, ids in prefixes.items():\n", + " if len(ids) >= 4:\n", + " pres.append((prefix, list(df.loc[list(ids)].c2)))\n", + "sufs = []\n", + "for suffix, ids in suffixes.items():\n", + " if len(ids) >= 4:\n", + " sufs.append((suffix, list(df.loc[list(ids)].c1)))" + ] + }, + { + "cell_type": "code", + "execution_count": 236, + "id": "86c69c9f-bc6a-4dd1-9eb3-ab37ad766586", + "metadata": {}, + "outputs": [], + "source": [ + "dprefix = {}\n", + "for prefix, ids in pres:\n", + " res = set()\n", + " for id in ids:\n", + " if (id[-1] == 's' and id[:-1] in ids) or (ids[-2:] == 'es' and ids[:-2] in ids):\n", + " continue\n", + " res.add(id)\n", + " if len(res) < 4:\n", + " continue\n", + " dprefix[prefix] = dict(group=f'{prefix} _', clues=list(res))\n", + "\n", + "dsuffix = {}\n", + "for suffix, ids in sufs:\n", + " if (suffix[-1] == 's' and suffix[:-1] in dsuffix) or (suffix[-2:] == 'es' and suffix[:-2] in ids):\n", + " #dsuffix[suffix[:-1]] = set(ids)\n", + " continue\n", + " if len(ids) < 4:\n", + " continue\n", + " dsuffix[suffix] = dict(group=f'_ {suffix}', clues=ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 237, + "id": "def43999-d789-4e5c-bb27-4fd29074c875", + "metadata": {}, + "outputs": [], + "source": [ + "from Levenshtein import ratio\n", + "def similar(a, b):\n", + " return ratio(a, b) >= .8\n", + "import inflect\n", + "\n", + "p = inflect.engine()\n", + "\n", + "def normalize(w):\n", + " pass\n", + "\n", + "def filter_duplicates(group):\n", + " if not group:\n", + " return []\n", + " ok = [group[0]]\n", + " for i in range(1, len(group)):\n", + " for word in ok:\n", + " if similar(word, group[i]):\n", + " break\n", + " else:\n", + " ok.append(group[i])\n", + " return ok" + ] + }, + { + "cell_type": 
"code", + "execution_count": 238, + "id": "6a3c04eb-79a6-47f5-846e-93258db65921", + "metadata": {}, + "outputs": [], + "source": [ + "blacklist = ['man', 'men', 'woman', 'women']" + ] + }, + { + "cell_type": "code", + "execution_count": 239, + "id": "dfb38b21-3dc4-495a-8805-446b2e9e8483", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "def process_corpus(corpus):\n", + " new = {}\n", + " for word, group in corpus.items():\n", + " clues = group['clues']\n", + " clues = [clue for clue in clues if clue not in blacklist]\n", + " clues = filter_duplicates(clues)\n", + " if len(clues) < 4:\n", + " continue\n", + " new[word] = dict(group=group['group'], clues=clues)\n", + " return new" + ] + }, + { + "cell_type": "code", + "execution_count": 240, + "id": "a59a4514-2572-4d35-a73d-fef58d1bc804", + "metadata": {}, + "outputs": [], + "source": [ + "corpus = {**dprefix}\n", + "corpus.update(dsuffix)\n", + "corpus.update(dsynonyms)\n", + "filtered_corpus = process_corpus(corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8025664c-e116-481a-9609-d58200f773ec", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 241, + "id": "fccac4d7-af42-4445-8dd5-6f4b0d3aa9ca", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "with open('../static/corpus.js', 'w') as f:\n", + " f.write('var corpus = ')\n", + " json.dump(filtered_corpus, f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4a82df07-568a-41f9-98c9-be0182522577", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 242, + "id": "46157b29-1084-4caa-be4f-7c56be562da8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[['encroach', 'impinge', 'infringe'],\n", + " ['encroach', 'entrench', 'impinge', 'trench'],\n", + " ['invasive', 'trespassing']]" + ] + }, + "execution_count": 242, + "metadata": {}, + "output_type": "execute_result" + } + 
], + "source": [ + "wn.synonyms('encroaching')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98e6a79f-4e7b-498d-a824-a44b52ae3829", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebcdf335-02c3-480c-a241-f83f7569acb0", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} -- cgit v1.2.3