links.py
import os
import re
import json
import requests
import pandas as pd
from glob import glob
from tqdm import tqdm
from itertools import chain
from functools import partial
from parallel import run_parallel  # local helper; presumably applies a function to each item in parallel

# matches bare http(s) URLs
LINK_RE = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# matches markdown-style links: [text](url), with the url optionally wrapped in backticks
MD_LINK_RE = re.compile(r'\[([^)]*)\]\(`?([^`\(\)]+)`?\)')
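
# scan a dataframe column for markdown links and bare URLs,
# returning {id: {'links': [(text, url), ...], 'score': score}}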
def extract_links(df, col, id_col):
    all_links = {}
    for _, (id, row, score) in tqdm(df[[id_col, col, 'score']].iterrows()):
        try:
            links = MD_LINK_RE.findall(row)
        except TypeError:
            # nan, keep going
            continue

        # get links that aren't in markdown form
        raw_links = LINK_RE.findall(row)

        # the extracted urls sometimes have trailing ')' and additional
        # punctuation, so clean them
        raw_links = [l.rsplit(')', 1)[0] for l in raw_links]

        # only add links that aren't already captured by the markdown regex
        urls = [l for _, l in links]
        links.extend([('', l) for l in raw_links if l not in urls])

        if links:
            all_links[id] = {
                'links': links,
                'score': score
            }
    return all_links
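
# stream an image to disk, named after the last path segment of its URL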
def download_image(dir, url):
    res = requests.get(url, stream=True)
    fname = url.split('/')[-1]
    path = os.path.join(dir, fname)
    if res.status_code == 200:
        with open(path, 'wb') as f:
            for chunk in res:
                f.write(chunk)
    else:
        print('failed to download:', url)
        # res.raise_for_status()
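
# pull links out of a subreddit's comment CSVs, save them as JSON,
# write out the image URLs, and optionally download the images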
def extract_comment_links(name, download_images=False):
    links = {}
    for csv in glob('data/reddit/{}/comments/*.csv'.format(name)):
        df = pd.read_csv(csv)
        links.update(extract_links(df, 'body', 'id'))

    with open('data/reddit/{}/links/comment_links.json'.format(name), 'w') as f:
        json.dump(links, f)

    links = list(chain.from_iterable(v['links'] for v in links.values()))
    links = [l for _, l in links]
    exts = ['.jpg', '.jpeg', '.png', '.gif', '.gifv']
    imgs = set([l for l in links if any(e in l for e in exts)])
    print('images:', len(imgs))

    with open('data/reddit/{}/links/comment_images.txt'.format(name), 'w') as f:
        f.write('\n'.join(imgs))

    if download_images:
        run_parallel(imgs, partial(download_image, 'data/reddit/{}/images/comments'.format(name)))
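
# same as above, but for post CSVs/JSON dumps, where the link of interest
# is the post's own `url` field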
def extract_post_links(name, download_images=False):
    links = {}
    for csv in glob('data/reddit/{}/posts/*.csv'.format(name)):
        df = pd.read_csv(csv)
        df['link'] = df['url']
        links.update(df.set_index('id')[['link', 'score']].to_dict(orient='index'))

    for jsn in glob('data/reddit/{}/posts/*.json'.format(name)):
        df = json.load(open(jsn, 'r'))
        links.update({d['id']: {
            'link': d['url'],
            'score': d['score']
        } for d in df})

    with open('data/reddit/{}/links/post_links.json'.format(name), 'w') as f:
        json.dump(links, f)

    links = [v['link'] for v in links.values()]
    exts = ['.jpg', '.jpeg', '.png', '.gif', '.gifv']
    imgs = set([l for l in links if any(e in l for e in exts)])
    print('images:', len(imgs))

    with open('data/reddit/{}/links/post_images.txt'.format(name), 'w') as f:
        f.write('\n'.join(imgs))

    if download_images:
        run_parallel(imgs, partial(download_image, 'data/reddit/{}/images/posts'.format(name)))

if __name__ == '__main__':
    download_images = False
    for name in ['altright', 'the_donald']:
        print(name)
        print('-> comments')
        extract_comment_links(name, download_images)
        print('-> posts')
        extract_post_links(name, download_images)