summaryrefslogtreecommitdiff
path: root/talk/pics/original
diff options
context:
space:
mode:
author: jaseg <git@jaseg.net> 2020-10-16 18:05:28 +0200
committer: jaseg <git@jaseg.net> 2020-10-16 18:06:04 +0200
commit: 1606044a40f6e4e13721a0f3766dbd9a28d67479 (patch)
tree: e5629e14d50861ba684ec46cd342127b1a22dbbf /talk/pics/original
parent: c2a26653af38f8056627d989365bc42082fe8f04 (diff)
download: master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.gz
master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.bz2
master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.zip
talk: Small fixes, add source list
Diffstat (limited to 'talk/pics/original')
-rw-r--r-- talk/pics/original/scrape.py | 36
1 files changed, 36 insertions, 0 deletions
diff --git a/talk/pics/original/scrape.py b/talk/pics/original/scrape.py
new file mode 100644
index 0000000..2191d09
--- /dev/null
+++ b/talk/pics/original/scrape.py
@@ -0,0 +1,36 @@
#!/usr/bin/env python3
"""Print attribution lines for the Unsplash pictures in the current directory.

Every file in the working directory whose name ends in
``-<11-character id>-unsplash.jpg`` is looked up at
``https://unsplash.com/photos/<id>``.  For each page that can be fetched and
parsed, one line ``<photographer>: <photo title or category>`` is written to
stdout.  The id of every photo that could not be processed is written to
stderr, one id per line.
"""

import os
import re
import sys
from os import path

import requests
from bs4 import BeautifulSoup
import IPython

if __name__ != '__main__':
    raise ImportError('This is a command-line script and not supposed to be imported.')

# Compiled once instead of on every loop iteration.
FILENAME_RE = re.compile(r'.*-([0-9a-zA-Z-]{11})-unsplash\.jpg$')
TITLED_PAGE_RE = re.compile(r'(.*) photo – Free (.*)Image on Unsplash')
UNTITLED_PAGE_RE = re.compile(r'Free (.*)Image on Unsplash')
PROFILE_ALT_RE = re.compile(r"Go to (.*)'s profile")

# Seconds to wait for unsplash.com before giving up on a photo; without a
# timeout a single stalled connection would hang the whole run.
REQUEST_TIMEOUT = 30


class ScrapeError(Exception):
    """A photo page was fetched but did not have the expected structure."""


def local_photo_ids(directory='.'):
    """Return the Unsplash photo ids embedded in the file names of *directory*."""
    matches = (FILENAME_RE.match(filename) for filename in os.listdir(directory))
    return [match.group(1) for match in matches if match]


def parse_description(page_title):
    """Return the photo title from *page_title*, or its category if untitled.

    Raises:
        ScrapeError: if *page_title* matches neither known title format.
    """
    match = TITLED_PAGE_RE.match(page_title)
    if match:
        title, category = match.groups()
    else:
        match = UNTITLED_PAGE_RE.match(page_title)
        if match is None:
            raise ScrapeError(f'unrecognised page title {page_title!r}')
        # The page carries no photo title, so fall back to the category.
        # (Previously the full page title was left in place here, which made
        # the category fallback below unreachable.)
        title, category = '', match.group(1)
    return title if title else category.strip()


def parse_photographer(soup):
    """Return the photographer name taken from the first profile-link image.

    Profile links are anchors whose ``href`` starts with ``/@``; the name is
    read from the ``alt`` text of the image inside such a link.

    Raises:
        ScrapeError: if no profile link with a matching ``alt`` text exists.
    """
    for anchor in soup.find_all('a'):
        # Anchors without an href must be skipped rather than raise KeyError.
        if not (anchor.get('href') or '').startswith('/@'):
            continue
        image = anchor.find('img')
        if image is None:
            continue
        match = PROFILE_ALT_RE.match(image.get('alt') or '')
        if match:
            return match.group(1)
    raise ScrapeError('no photographer profile link found')


def describe_photo(photo_id):
    """Fetch the page for *photo_id* and return its attribution line.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        ScrapeError: if the page does not have the expected structure.
    """
    response = requests.get(f'https://unsplash.com/photos/{photo_id}',
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features='lxml')

    title_tag = soup.find('title')
    if title_tag is None:
        raise ScrapeError('page has no <title> element')

    description = parse_description(title_tag.text)
    name = parse_photographer(soup)
    return f'{name}: {description}'


def main():
    """Print one attribution line per local Unsplash picture."""
    for photo_id in local_photo_ids():
        try:
            line = describe_photo(photo_id)
        except (requests.RequestException, ScrapeError):
            # Only expected fetch/parse failures are reported here, so that
            # Ctrl-C and genuine programming errors are no longer swallowed.
            # stderr keeps its one-id-per-line format.
            print(photo_id, file=sys.stderr)
        else:
            print(line)


main()
+