diff options
author | jaseg <git@jaseg.net> | 2020-10-16 18:05:28 +0200 |
---|---|---|
committer | jaseg <git@jaseg.net> | 2020-10-16 18:06:04 +0200 |
commit | 1606044a40f6e4e13721a0f3766dbd9a28d67479 (patch) | |
tree | e5629e14d50861ba684ec46cd342127b1a22dbbf /talk/pics/original | |
parent | c2a26653af38f8056627d989365bc42082fe8f04 (diff) | |
download | master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.gz master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.bz2 master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.zip |
talk: Small fixes, add source list
Diffstat (limited to 'talk/pics/original')
-rw-r--r-- | talk/pics/original/scrape.py | 36 |
1 files changed, 36 insertions, 0 deletions
#!/usr/bin/env python3
# talk/pics/original/scrape.py (new file, mode 100644, blob 2191d09)
"""Print photographer and title for every Unsplash picture in the current directory.

Picture files are recognised by their ``<anything>-<11 character id>-unsplash.jpg``
name. For each id the Unsplash photo page is fetched and one line of the form
``<photographer>: <title or category>`` is written to stdout. Ids whose page
could not be fetched or parsed are written to stderr, one per line.
"""

import os
import re
import sys
import requests
from bs4 import BeautifulSoup
import IPython
from os import path

if __name__ != '__main__':
    raise ImportError('This is a command-line script and not supposed to be imported.')

# Patterns are compiled once instead of being re-looked-up for every picture.
FILENAME_RE = re.compile(r'.*-([0-9a-zA-Z-]{11})-unsplash\.jpg$')
CAPTIONED_TITLE_RE = re.compile(r'(.*) photo – Free (.*)Image on Unsplash')
CATEGORY_TITLE_RE = re.compile(r'Free (.*)Image on Unsplash')
PROFILE_ALT_RE = re.compile("Go to (.*)'s profile")

# Seconds to wait for Unsplash before giving up on a picture; without a
# timeout a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30


def split_page_title(page_title):
    """Split the text of a photo page's ``<title>`` into ``(title, category)``.

    Returns an empty title when the page title only carries a category, so
    that the caller can fall back to printing the category.

    Raises:
        ValueError: if the page title matches neither known layout.
    """
    match = CAPTIONED_TITLE_RE.match(page_title)
    if match:
        return match.groups()

    match = CATEGORY_TITLE_RE.match(page_title)
    if match is None:
        raise ValueError(f'unrecognised page title: {page_title!r}')
    # No caption on this page: report an empty title instead of leaving the
    # raw page title in place, which would hide the category from the output.
    return '', match.group(1)


def find_photographer(soup):
    """Return the photographer name taken from the first profile link image.

    Profile links are anchors whose ``href`` starts with ``/@``; the name is
    read from the ``alt`` text of the image inside such a link.

    Raises:
        ValueError: if no profile link with a matching ``alt`` text exists.
    """
    for anchor in soup.find_all('a'):
        # Anchors without an href must be skipped, not abort the whole page.
        if not anchor.get('href', '').startswith('/@'):
            continue
        img = anchor.find('img')
        if img is None:
            continue
        match = PROFILE_ALT_RE.match(img.get('alt', ''))
        if match:
            return match.group(1)
    raise ValueError('no photographer profile link found')


pic_ids = [ FILENAME_RE.match(fn) for fn in os.listdir() ]
pic_ids = [ match.group(1) for match in pic_ids if match ]

for pic_id in pic_ids:
    try:
        res = requests.get(f'https://unsplash.com/photos/{pic_id}', timeout=REQUEST_TIMEOUT)
        # Treat HTTP errors as failures instead of parsing an error page.
        res.raise_for_status()
        soup = BeautifulSoup(res.text, features='lxml')

        title, category = split_page_title(soup.find('title').text)
        name = find_photographer(soup)

        print(f'{name}: {title if title else category.strip()}')
    except Exception:
        # Best effort: report the id that needs manual attention and carry on.
        # KeyboardInterrupt/SystemExit are deliberately not caught so the run
        # can still be aborted.
        print(pic_id, file=sys.stderr)