diff options
Diffstat (limited to 'generators')
-rw-r--r-- | generators/datageneration.ipynb | 302 |
1 files changed, 292 insertions, 10 deletions
diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb index beeb901..457cc1d 100644 --- a/generators/datageneration.ipynb +++ b/generators/datageneration.ipynb @@ -136,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 233, + "execution_count": 243, "id": "b27aa837-73d2-4b10-826b-990e12a3f7e2", "metadata": {}, "outputs": [], @@ -146,12 +146,13 @@ }, { "cell_type": "code", - "execution_count": 234, + "execution_count": 244, "id": "176e2790-560c-4daf-b436-a1771611c4bf", "metadata": {}, "outputs": [], "source": [ - "df = df.drop(df[df.correctParse == 'no'].index)" + "df = df.drop(df[df.correctParse == 'no'].index)\n", + "df = df.drop(df[df.isCommonstim == 'no'].index)" ] }, { @@ -276,11 +277,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 259, "id": "8025664c-e116-481a-9609-d58200f773ec", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "437 330\n" + ] + } + ], + "source": [ + "print(len(dprefix), len(dsuffix))" + ] }, { "cell_type": "code", @@ -328,17 +339,288 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 252, "id": "98e6a79f-4e7b-498d-a824-a44b52ae3829", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>c1</th>\n", + " <th>c2</th>\n", + " <th>stim</th>\n", + " <th>isCommonC1</th>\n", + " <th>isCommonC2</th>\n", + " <th>isCommonstim</th>\n", + " </tr>\n", + " <tr>\n", + " <th>id_master</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>3237</th>\n", + " <td>gad</td>\n", + " <td>about</td>\n", + " <td>gadabout</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4592</th>\n", + " <td>knock</td>\n", + " <td>about</td>\n", + " <td>knockabout</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8231</th>\n", + " <td>turn</td>\n", + " <td>about</td>\n", + " <td>turnabout</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6139</th>\n", + " <td>race</td>\n", + " <td>about</td>\n", + " <td>raceabout</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8331</th>\n", + " <td>walk</td>\n", + " <td>about</td>\n", + " <td>walkabout</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>...</th>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " <td>...</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4515</th>\n", + " <td>junk</td>\n", + " <td>yards</td>\n", + " <td>junkyards</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>6812</th>\n", + " <td>ship</td>\n", + " <td>yards</td>\n", + " <td>shipyards</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2667</th>\n", + " <td>farm</td>\n", + " <td>yards</td>\n", + " <td>farmyards</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1007</th>\n", + " <td>brick</td>\n", + " <td>yards</td>\n", + " <td>brickyards</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>8892</th>\n", + " <td>zig</td>\n", + " <td>zag</td>\n", + " <td>zigzag</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>8372 rows × 6 columns</p>\n", + "</div>" + ], + "text/plain": [ + " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n", + "id_master \n", + "3237 gad about gadabout 1 1 1\n", + "4592 knock about knockabout 1 1 1\n", + "8231 turn about turnabout 1 1 1\n", + "6139 race about raceabout 1 1 0\n", + "8331 walk about walkabout 1 1 1\n", + "... ... ... ... ... ... ...\n", + "4515 junk yards junkyards 1 0 0\n", + "6812 ship yards shipyards 1 0 0\n", + "2667 farm yards farmyards 1 0 0\n", + "1007 brick yards brickyards 1 0 0\n", + "8892 zig zag zigzag 0 0 1\n", + "\n", + "[8372 rows x 6 columns]" + ] + }, + "execution_count": 252, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 258, "id": "ebcdf335-02c3-480c-a241-f83f7569acb0", "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>c1</th>\n", + " <th>c2</th>\n", + " <th>stim</th>\n", + " <th>isCommonC1</th>\n", + " <th>isCommonC2</th>\n", + " <th>isCommonstim</th>\n", + " </tr>\n", + " <tr>\n", + " <th>id_master</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>8361</th>\n", + " <td>war</td>\n", + " <td>fare</td>\n", + " <td>warfare</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2715</th>\n", + " <td>field</td>\n", + " <td>fare</td>\n", + " <td>fieldfare</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1298</th>\n", + " <td>car</td>\n", + " <td>fare</td>\n", + " <td>carfare</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " <tr>\n", + " <th>51</th>\n", + " <td>air</td>\n", + " <td>fare</td>\n", + " <td>airfare</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n", + "id_master \n", + "8361 war fare warfare 1 1 1\n", + "2715 field fare fieldfare 1 1 0\n", + "1298 car fare carfare 1 1 1\n", + "51 air fare airfare 1 1 1" + ] + }, + "execution_count": 258, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.c2=='fare'][['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50989f8d-368e-4b4d-ab6c-355efce36c93", + "metadata": {}, "outputs": [], "source": [] } |