summary | refs | log | tree | commit | diff
path: root/generators
diff options
context:
space:
mode:
author cyfraeviolae <cyfraeviolae> 2023-11-19 16:42:07 -0500
committer cyfraeviolae <cyfraeviolae> 2023-11-19 16:42:07 -0500
commit 758156db378bfc16e598f8d4c62d429236fe6e12 (patch)
tree 47e3d618f604ffb19fc62fc2ce5f4f27de3a06e8 /generators
parent 8b1d9b24ab0c92c7ab1517ccecb4903017ad53d8 (diff)
common pre/suf
Diffstat (limited to 'generators')
-rw-r--r-- generators/datageneration.ipynb 302
1 files changed, 292 insertions, 10 deletions
diff --git a/generators/datageneration.ipynb b/generators/datageneration.ipynb
index beeb901..457cc1d 100644
--- a/generators/datageneration.ipynb
+++ b/generators/datageneration.ipynb
@@ -136,7 +136,7 @@
},
{
"cell_type": "code",
- "execution_count": 233,
+ "execution_count": 243,
"id": "b27aa837-73d2-4b10-826b-990e12a3f7e2",
"metadata": {},
"outputs": [],
@@ -146,12 +146,13 @@
},
{
"cell_type": "code",
- "execution_count": 234,
+ "execution_count": 244,
"id": "176e2790-560c-4daf-b436-a1771611c4bf",
"metadata": {},
"outputs": [],
"source": [
- "df = df.drop(df[df.correctParse == 'no'].index)"
+ "df = df.drop(df[df.correctParse == 'no'].index)\n",
+ "df = df.drop(df[df.isCommonstim == 0].index)  # isCommonstim is displayed as 0/1 (not 'yes'/'no'); drop the uncommon (0) rows"
]
},
{
@@ -276,11 +277,21 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 259,
"id": "8025664c-e116-481a-9609-d58200f773ec",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "437 330\n"
+ ]
+ }
+ ],
+ "source": [
+ "print(len(dprefix), len(dsuffix))"
+ ]
},
{
"cell_type": "code",
@@ -328,17 +339,288 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 252,
"id": "98e6a79f-4e7b-498d-a824-a44b52ae3829",
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>c1</th>\n",
+ " <th>c2</th>\n",
+ " <th>stim</th>\n",
+ " <th>isCommonC1</th>\n",
+ " <th>isCommonC2</th>\n",
+ " <th>isCommonstim</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>id_master</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>3237</th>\n",
+ " <td>gad</td>\n",
+ " <td>about</td>\n",
+ " <td>gadabout</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4592</th>\n",
+ " <td>knock</td>\n",
+ " <td>about</td>\n",
+ " <td>knockabout</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8231</th>\n",
+ " <td>turn</td>\n",
+ " <td>about</td>\n",
+ " <td>turnabout</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6139</th>\n",
+ " <td>race</td>\n",
+ " <td>about</td>\n",
+ " <td>raceabout</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8331</th>\n",
+ " <td>walk</td>\n",
+ " <td>about</td>\n",
+ " <td>walkabout</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>...</th>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " <td>...</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>4515</th>\n",
+ " <td>junk</td>\n",
+ " <td>yards</td>\n",
+ " <td>junkyards</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>6812</th>\n",
+ " <td>ship</td>\n",
+ " <td>yards</td>\n",
+ " <td>shipyards</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2667</th>\n",
+ " <td>farm</td>\n",
+ " <td>yards</td>\n",
+ " <td>farmyards</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1007</th>\n",
+ " <td>brick</td>\n",
+ " <td>yards</td>\n",
+ " <td>brickyards</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>8892</th>\n",
+ " <td>zig</td>\n",
+ " <td>zag</td>\n",
+ " <td>zigzag</td>\n",
+ " <td>0</td>\n",
+ " <td>0</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "<p>8372 rows × 6 columns</p>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n",
+ "id_master \n",
+ "3237 gad about gadabout 1 1 1\n",
+ "4592 knock about knockabout 1 1 1\n",
+ "8231 turn about turnabout 1 1 1\n",
+ "6139 race about raceabout 1 1 0\n",
+ "8331 walk about walkabout 1 1 1\n",
+ "... ... ... ... ... ... ...\n",
+ "4515 junk yards junkyards 1 0 0\n",
+ "6812 ship yards shipyards 1 0 0\n",
+ "2667 farm yards farmyards 1 0 0\n",
+ "1007 brick yards brickyards 1 0 0\n",
+ "8892 zig zag zigzag 0 0 1\n",
+ "\n",
+ "[8372 rows x 6 columns]"
+ ]
+ },
+ "execution_count": 252,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]"
+ ]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 258,
"id": "ebcdf335-02c3-480c-a241-f83f7569acb0",
"metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>c1</th>\n",
+ " <th>c2</th>\n",
+ " <th>stim</th>\n",
+ " <th>isCommonC1</th>\n",
+ " <th>isCommonC2</th>\n",
+ " <th>isCommonstim</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>id_master</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>8361</th>\n",
+ " <td>war</td>\n",
+ " <td>fare</td>\n",
+ " <td>warfare</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>2715</th>\n",
+ " <td>field</td>\n",
+ " <td>fare</td>\n",
+ " <td>fieldfare</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1298</th>\n",
+ " <td>car</td>\n",
+ " <td>fare</td>\n",
+ " <td>carfare</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>51</th>\n",
+ " <td>air</td>\n",
+ " <td>fare</td>\n",
+ " <td>airfare</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " <td>1</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " c1 c2 stim isCommonC1 isCommonC2 isCommonstim\n",
+ "id_master \n",
+ "8361 war fare warfare 1 1 1\n",
+ "2715 field fare fieldfare 1 1 0\n",
+ "1298 car fare carfare 1 1 1\n",
+ "51 air fare airfare 1 1 1"
+ ]
+ },
+ "execution_count": 258,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df.c2=='fare'][['c1', 'c2', 'stim', 'isCommonC1', 'isCommonC2', 'isCommonstim']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "50989f8d-368e-4b4d-ab6c-355efce36c93",
+ "metadata": {},
"outputs": [],
"source": []
}