1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
|
{
"cells": [
{
"cell_type": "code",
"execution_count": 50,
"id": "9fdccc99-04bd-4d5b-b632-101b669ffb7e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"\n",
"import pandas as pd\n",
"import pyarrow as pa\n",
"import pyarrow.parquet as pq\n",
"import pyarrow.dataset as ds\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"id": "a10bff4c-f5f6-4102-8151-2c4020b9d2d6",
"metadata": {},
"outputs": [],
"source": [
"# Upper bound on how many rows are pulled into memory.\n",
"n = 10000\n",
"\n",
"# Open the dataset stored under data/, take its first n rows,\n",
"# and convert them to a pandas DataFrame.\n",
"df = (\n",
"    ds.dataset('data/')\n",
"    .scanner()\n",
"    .head(n)\n",
"    .to_pandas()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "b07c097e-6857-4fad-900a-a9643fb5dc9d",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"# Write the raw image bytes of every loaded row to corpus/<row position>.jpg.\n",
"# open() does not create missing directories, so make sure corpus/ exists first.\n",
"os.makedirs('corpus', exist_ok=True)\n",
"\n",
"# Iterate over the rows actually present in df rather than range(n):\n",
"# when the dataset holds fewer than n rows, df.iloc[i] would raise IndexError.\n",
"for i in range(len(df)):\n",
"    row = df.iloc[i]\n",
"    with open(f'corpus/{i}.jpg', 'wb') as f:\n",
"        f.write(row.image['bytes'])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "2080bb6f-44df-4b45-8bc2-308105b89e09",
"metadata": {},
"outputs": [],
"source": [
"# Map each row position in df to the integer value of its artist column.\n",
"# Enumerating the column covers exactly the rows that were loaded (range(n)\n",
"# raises IndexError when df has fewer than n rows) and avoids building a\n",
"# full row Series through df.iloc[i] for every single lookup.\n",
"artists = {\n",
"    i: int(artist) for i, artist in enumerate(df['artist'])\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "e91afe69-1722-426f-862e-b3ce98207d35",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Write the mapping to artists.js as a JavaScript statement that assigns it\n",
"# to a variable named artists.\n",
"# Note: json.dumps serialises the integer keys as strings.\n",
"with open('artists.js', 'w') as f:\n",
"    f.write('var artists = ' + json.dumps(artists))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f44450c8-30c5-4fc1-90a8-53e33b94ee65",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "649daac5-2a2c-4af2-8120-e5b80838bd9b",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|