{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "export-overview",
   "metadata": {},
   "source": [
    "# Export images and artist ids\n",
    "\n",
    "Loads up to `N_ROWS` rows from the dataset in `data/`, writes each row's `image['bytes']` to `corpus/<row position>.jpg`, and saves a mapping from row position to `artist` in `artists.js` as a `var artists = {...}` assignment.\n",
    "\n",
    "The image bytes are written unchanged; the notebook does not check that they are actually JPEG data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9fdccc99-04bd-4d5b-b632-101b669ffb7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "import pyarrow as pa\n",
    "import pyarrow.parquet as pq\n",
    "import pyarrow.dataset as ds\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a10bff4c-f5f6-4102-8151-2c4020b9d2d6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Everything a reader might want to tune lives here.\n",
    "N_ROWS = 10000                   # upper bound on the number of rows exported\n",
    "DATA_DIR = 'data/'               # dataset location handed to pyarrow\n",
    "CORPUS_DIR = Path('corpus')      # one image file per row is written here\n",
    "ARTISTS_JS = Path('artists.js')  # JavaScript file holding the artist mapping\n",
    "\n",
    "# head() can return fewer than N_ROWS rows when the dataset is small, so the\n",
    "# cells below iterate over df itself instead of assuming exactly N_ROWS rows.\n",
    "df = ds.dataset(DATA_DIR).scanner().head(N_ROWS).to_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b07c097e-6857-4fad-900a-a9643fb5dc9d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create the output directory up front so a fresh checkout does not fail\n",
    "# with FileNotFoundError on the first write.\n",
    "CORPUS_DIR.mkdir(parents=True, exist_ok=True)\n",
    "\n",
    "# File names use the row's position in df (0, 1, 2, ...), matching the keys\n",
    "# of the artists mapping built in the next cell.\n",
    "for i, image in enumerate(df['image']):\n",
    "    (CORPUS_DIR / f'{i}.jpg').write_bytes(image['bytes'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2080bb6f-44df-4b45-8bc2-308105b89e09",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Row position -> artist value as a plain Python int, so json.dumps can\n",
    "# serialize it.\n",
    "artists = {i: int(artist) for i, artist in enumerate(df['artist'])}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e91afe69-1722-426f-862e-b3ce98207d35",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Emit a JavaScript assignment rather than bare JSON. json.dumps turns the\n",
    "# integer keys into strings.\n",
    "ARTISTS_JS.write_text('var artists = ' + json.dumps(artists))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}