diff options
Diffstat (limited to 'scraper.py')
-rwxr-xr-x | scraper.py | 27 |
1 files changed, 27 insertions, 0 deletions
#!/usr/bin/env python3
"""Download the original media file behind each file-description page URI.

Usage: scraper.py PAGE_URI [PAGE_URI ...]

For every URI given on the command line the page is fetched and scanned
line by line for the original-file link, a short license name and an
author.  The linked file is saved under ``./static/assets/`` and a
``[[mediums.works.figures]]`` entry describing it is printed to stdout.

NOTE(review): the markers searched for ('Original file',
'licensetpl_short', 'Author</td') look like Wikimedia Commons file pages --
confirm against the pages this is actually run on.
"""

# yes, html is not a regular language

import re
import sys

# Directory the downloaded files are written into (must already exist).
ASSET_DIR = "./static/assets/"

# Seconds to wait on a silent server before giving up on a request.
REQUEST_TIMEOUT = 30

# The leading greedy ``.*`` makes this capture the LAST href on the line.
_HREF_RE = re.compile(r'.*href="(.*?)".*')

# Optional prefix up to the first '>', then the text up to the next '<'.
_AUTHOR_RE = re.compile(r'(.*?>)?(.*?)<.*')


def parse_file_page(html, page_uri):
    """Extract the file URI, license and author from a page's HTML.

    Args:
        html: Text of the fetched page.
        page_uri: URI the page was fetched from; used as the file URI when
            no usable 'Original file' link is found.

    Returns:
        A ``(file_uri, license_name, author)`` tuple.  ``file_uri`` gets an
        ``https:`` prefix when it does not start with ``http``.
        ``license_name`` and ``author`` are ``None`` when not found.
    """
    lines = html.split('\n')
    file_uri = page_uri
    license_name = None
    author = None

    for idx, line in enumerate(lines):
        if 'Original file' in line:
            href = _HREF_RE.match(line)
            # The marker text can occur on a line without a link; keep the
            # previous value instead of crashing on a failed match.
            if href is not None:
                file_uri = href.group(1)
        elif 'licensetpl_short' in line:
            # Text after the last '>' on the line; the last marker line wins.
            license_name = line.rsplit('>', 1)[-1]
        elif author is None and 'Author</td' in line:
            # The author text is expected two lines below the label cell.
            if idx + 2 < len(lines):
                found = _AUTHOR_RE.match(lines[idx + 2])
                if found is not None:
                    author = found.group(2)

    if not file_uri.startswith('http'):
        # Scheme-relative links ("//host/path") need an explicit scheme.
        file_uri = 'https:' + file_uri
    return file_uri, license_name, author


def format_figure(local_filename, license_name, author):
    """Return the figure entry text for one downloaded file.

    A missing (``None``) license or author contributes an empty string to
    the byline rather than the literal text ``None``.
    """
    byline = f'{license_name or ""}{author or ""}'
    return '\n'.join([
        '[[mediums.works.figures]]',
        'file = "' + local_filename + '"',
        f'byline = "{byline}"',
    ])


def main(argv=None):
    """Fetch every page URI, save its file and print its figure entry.

    Args:
        argv: Page URIs to process; defaults to ``sys.argv[1:]``.

    Raises:
        requests.HTTPError: If a page or file request returns an error
            status, so an error body is never saved as an asset.
    """
    # Imported here so the parsing helpers above stay importable without
    # the third-party dependency installed.
    import requests

    uris = sys.argv[1:] if argv is None else argv
    for page_uri in uris:
        page = requests.get(page_uri, timeout=REQUEST_TIMEOUT)
        page.raise_for_status()
        file_uri, license_name, author = parse_file_page(page.text, page_uri)

        download = requests.get(
            file_uri, allow_redirects=True, timeout=REQUEST_TIMEOUT
        )
        download.raise_for_status()

        local_filename = file_uri.split('/')[-1]
        with open(ASSET_DIR + local_filename, 'wb') as out:
            out.write(download.content)

        print(format_figure(local_filename, license_name, author))


if __name__ == "__main__":
    main()