# talk/pics/original/scrape.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36

#!/usr/bin/env python3

import os
import re
import sys
import requests
from bs4 import BeautifulSoup
import IPython
from os import path

# Refuse to be imported: the rest of this file runs at module level (it scans
# the current directory and performs HTTP requests), so it only makes sense
# when executed directly as a script.
if __name__ != '__main__':
    raise ImportError('This is a command-line script and not supposed to be imported.')

# Collect the 11-character photo ids embedded in file names of the form
# "<anything>-<id>-unsplash.jpg" found in the current working directory.
# Entries that do not follow that naming scheme are ignored.
_id_pattern = re.compile(r'.*-([0-9a-zA-Z-]{11})-unsplash\.jpg$')
pic_ids = [
    found.group(1)
    for found in map(_id_pattern.match, os.listdir())
    if found
]

# For every photo id, fetch its Unsplash page and print one credit line to
# stdout in the form "<photographer>: <title or category>". The id of any
# photo that cannot be fetched or parsed is printed to stderr instead, and
# processing continues with the next id (best effort).

# Page <title> when the photo has a descriptive title: groups are
# (title, category).
_TITLED_RE = re.compile(r'(.*) photo – Free (.*)Image on Unsplash')
# Page <title> when there is no descriptive title: the only group is the category.
_UNTITLED_RE = re.compile(r'Free (.*)Image on Unsplash')
# Alt text of the avatar image inside a link to the photographer's profile.
_PROFILE_ALT_RE = re.compile("Go to (.*)'s profile")

for pic_id in pic_ids:
    try:
        # A timeout keeps one stalled connection from hanging the whole run;
        # HTTP error statuses are treated as failures for this id.
        res = requests.get(f'https://unsplash.com/photos/{pic_id}', timeout=30)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, features='lxml')

        page_title = soup.find('title').text
        match = _TITLED_RE.match(page_title)
        if match:
            title, category = match.groups()
        else:
            match = _UNTITLED_RE.match(page_title)
            if match is None:
                raise ValueError(f'unrecognised page title: {page_title!r}')
            # No descriptive title: clear it so the category is printed
            # below rather than the raw page title.
            title = None
            category, = match.groups()

        # The photographer's name comes from the alt text of an image inside
        # a profile link (href starting with "/@"). Anchors without an href
        # and images without an alt are skipped instead of raising KeyError.
        name = None
        for anchor in soup.find_all('a'):
            if not anchor.get('href', '').startswith('/@'):
                continue
            img = anchor.find('img')
            if img is None:
                continue
            profile = _PROFILE_ALT_RE.match(img.get('alt', ''))
            if profile:
                name = profile.group(1)
                break
        if name is None:
            raise LookupError('photographer profile link not found')

        print(f'{name}: {title if title else category.strip()}')
    except Exception:
        # Deliberately broad so one bad page does not abort the run, but not
        # a bare "except:", so Ctrl-C and SystemExit still stop the script.
        print(pic_id, file=sys.stderr)