summaryrefslogtreecommitdiff
path: root/scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'scraper.py')
-rwxr-xr-xscraper.py27
1 files changed, 27 insertions, 0 deletions
diff --git a/scraper.py b/scraper.py
new file mode 100755
index 0000000..7f6144b
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,27 @@
#!/usr/bin/env python3
"""Download media files referenced by web pages and print figure entries.

Usage: scraper.py URI [URI ...]

Each URI is fetched and its text is scanned line by line for a link to the
original file, a short license name and an author (the markers look like
those of MediaWiki file description pages -- TODO confirm).  The file is
saved under ``./static/assets/`` and a ``[[mediums.works.figures]]`` entry
naming the saved file and its byline is printed to stdout.
"""

# yes, html is not a regular language

import re
import sys

ASSET_DIR = "./static/assets/"

# The leading greedy ``.*`` makes the *last* href on the line win.
_HREF_RE = re.compile(r'.*href="(.*?)".*')
# Captures the text between an optional leading tag and the next "<".
_CELL_TEXT_RE = re.compile(r'(.*?>)?(.*?)<.*')


def parse_file_page(html):
    """Extract the original-file link, license and author from page text.

    The text is split on newlines and each line is checked, in this order,
    for the markers ``Original file``, ``licensetpl_short`` and
    ``Author</td``; a line is only handled by the first marker it contains.

    Args:
        html: Full text of the fetched page.

    Returns:
        A ``(file_uri, license_name, author)`` tuple.  Each element is
        ``None`` when the corresponding marker was not found or its value
        could not be extracted.  For the link and the license the last
        matching line wins; for the author the first successful match wins.
    """
    file_uri = None
    license_name = None
    author = None
    lines = html.split('\n')
    for idx, line in enumerate(lines):
        if 'Original file' in line:
            found = _HREF_RE.match(line)
            # A marker line without an href is ignored instead of crashing.
            if found:
                file_uri = found.group(1)
        elif 'licensetpl_short' in line:
            # Whatever follows the last ">" on the line.
            license_name = line.rpartition('>')[2]
        elif author is None and 'Author</td' in line:
            # The value is read from the line two below the label; guard
            # against the label sitting too close to the end of the page.
            if idx + 2 < len(lines):
                found = _CELL_TEXT_RE.match(lines[idx + 2])
                if found:
                    author = found.group(2)
    return file_uri, license_name, author


def main(argv=None):
    """Fetch every page in *argv*, save its file and print a figure entry.

    Args:
        argv: Page URIs to process; defaults to ``sys.argv[1:]``.
    """
    # Imported here so parse_file_page stays importable without the
    # third-party dependency.
    import requests

    page_uris = sys.argv[1:] if argv is None else argv
    for page_uri in page_uris:
        page_text = requests.get(page_uri).text
        file_uri, license_name, author = parse_file_page(page_text)
        if file_uri is None:
            # No original-file link found: download the given URI itself.
            file_uri = page_uri
        if not file_uri.startswith('http'):
            # Scheme-less links ("//host/path") get an explicit scheme.
            file_uri = 'https:' + file_uri
        download = requests.get(file_uri, allow_redirects=True)
        local_filename = file_uri.split('/')[-1]
        with open(ASSET_DIR + local_filename, 'wb') as out:
            out.write(download.content)
        print('[[mediums.works.figures]]')
        print('file = "' + local_filename + '"')
        print(f'byline = "{license_name}{author}"')


if __name__ == '__main__':
    main()