From 758156db378bfc16e598f8d4c62d429236fe6e12 Mon Sep 17 00:00:00 2001 From: cyfraeviolae Date: Sun, 19 Nov 2023 16:42:07 -0500 Subject: common pre/suf --- generators/datageneration.ipynb | 302 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 292 insertions(+), 10 deletions(-) (limited to 'generators') diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb index beeb901..457cc1d 100644 --- a/generators/datageneration.ipynb +++ b/generators/datageneration.ipynb @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 233, + "execution_count": 243, "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2", "metadata": {}, "outputs": [], @@ -146,12 +146,13 @@ }, { "cell_type": "code", - "execution_count": 234, + "execution_count": 244, "id": "176e2790-560c-4daf-b436-a1771611c4bf", "metadata": {}, "outputs": [], "source": [ - "df = df.drop(df[df.correctParse == 'no'].index)" + "df = df.drop(df[df.correctParse == 'no'].index)\n", + "df = df.drop(df[df.isCommonstim == 'no'].index)" ] }, { @@ -276,11 +277,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 259, "id": "8025664c-e116-481a-9609-d58200f773ec", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "437 330\n" + ] + } + ], + "source": [ + "print(len(dprefix), len(dsuffix))" + ] }, { "cell_type": "code", @@ -328,17 +339,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 252, "id": "98e6a79f-4e7b-498d-a824-a44b52ae3829", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c1c2stimisCommonC1isCommonC2isCommonstim
id_master
3237gadaboutgadabout111
4592knockaboutknockabout111
8231turnaboutturnabout111
6139raceaboutraceabout110
8331walkaboutwalkabout111
.....................
4515junkyardsjunkyards100
6812shipyardsshipyards100
2667farmyardsfarmyards100
1007brickyardsbrickyards100
8892zigzagzigzag001
\n", + "

8372 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n", + "id_master \n", + "3237 gad about gadabout 1 1 1\n", + "4592 knock about knockabout 1 1 1\n", + "8231 turn about turnabout 1 1 1\n", + "6139 race about raceabout 1 1 0\n", + "8331 walk about walkabout 1 1 1\n", + "... ... ... ... ... ... ...\n", + "4515 junk yards junkyards 1 0 0\n", + "6812 ship yards shipyards 1 0 0\n", + "2667 farm yards farmyards 1 0 0\n", + "1007 brick yards brickyards 1 0 0\n", + "8892 zig zag zigzag 0 0 1\n", + "\n", + "[8372 rows x 6 columns]" + ] + }, + "execution_count": 252, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 258, "id": "ebcdf335-02c3-480c-a241-f83f7569acb0", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
c1c2stimisCommonC1isCommonC2isCommonstim
id_master
8361warfarewarfare111
2715fieldfarefieldfare110
1298carfarecarfare111
51airfareairfare111
\n", + "
" + ], + "text/plain": [ + " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n", + "id_master \n", + "8361 war fare warfare 1 1 1\n", + "2715 field fare fieldfare 1 1 0\n", + "1298 car fare carfare 1 1 1\n", + "51 air fare airfare 1 1 1" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.c2=='fare'][['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50989f8d-368e-4b4d-ab6c-355efce36c93", + "metadata": {}, "outputs": [], "source": [] } -- cgit v1.2.3