talk: Small fixes, add source list

author: jaseg <git@jaseg.net> 2020-10-16 18:05:28 +0200
committer: jaseg <git@jaseg.net> 2020-10-16 18:06:04 +0200
commit: 1606044a40f6e4e13721a0f3766dbd9a28d67479 (patch)
tree: e5629e14d50861ba684ec46cd342127b1a22dbbf /talk/pics/original
parent: c2a26653af38f8056627d989365bc42082fe8f04 (diff)
download: master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.gz
master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.bz2
master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.zip
1 files changed, 36 insertions, 0 deletions
diff --git a/talk/pics/original/scrape.py b/talk/pics/original/scrape.py
new file mode 100644
index 0000000..2191d09
--- /dev/null
+++ b/talk/pics/original/scrape.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+
+import os
+import re
+import sys
+import requests
+from bs4 import BeautifulSoup
+import IPython
+from os import path
+
+if __name__ != '__main__':
+    raise ImportError('This is a command-line script and not supposed to be imported.')
+
+# Photo IDs come from files named "<anything>-<11-char id>-unsplash.jpg" in the current directory.
+pic_ids = [ re.match(r'.*-([0-9a-zA-Z-]{11})-unsplash\.jpg$', fn) for fn in os.listdir() ]
+pic_ids = [ match.group(1) for match in pic_ids if match ]
+
+for pic_id in pic_ids:  # prints one "<photographer>: <title, else category>" line per photo
+    try:
+        res = requests.get(f'https://unsplash.com/photos/{pic_id}', timeout=30)  # never hang forever
+        res.raise_for_status()  # do not parse an HTTP error page as if it were a photo page
+        soup = BeautifulSoup(res.text, features='lxml')
+
+        page_title = soup.find('title').text
+        match = re.match(r'(.*) photo – Free (.*)Image on Unsplash', page_title)
+        if match:
+            title, category = match.groups()
+        else:  # no "<title> photo – " prefix: clear title so the category fallback below is used
+            title, category = None, re.match(r'Free (.*)Image on Unsplash', page_title).group(1)
+        # Name is read from the alt text of the first <img> inside a link whose href starts with "/@".
+        alts = [ img['alt'] for img in [ a.findChild('img') for a in soup.find_all('a', href=True) if a['href'].startswith('/@') ] if img ]
+        name = re.match("Go to (.*)'s profile", alts[0]).group(1)
+        print(f'{name}: {title if title else category.strip()}')
+    except Exception as e:  # best effort per photo, but let Ctrl-C through and report the reason
+        print(f'{pic_id}: {e!r}', file=sys.stderr)
+
author	jaseg <git@jaseg.net>	2020-10-16 18:05:28 +0200
committer	jaseg <git@jaseg.net>	2020-10-16 18:06:04 +0200
commit	1606044a40f6e4e13721a0f3766dbd9a28d67479 (patch)
tree	e5629e14d50861ba684ec46cd342127b1a22dbbf /talk/pics/original
parent	c2a26653af38f8056627d989365bc42082fe8f04 (diff)
download	master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.gz master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.tar.bz2 master-thesis-1606044a40f6e4e13721a0f3766dbd9a28d67479.zip