fetch.py
"""Fetch document listings from regeringen.se, download each page,
extract Markdown content and metadata, and write JSON indexes."""

import os

from services.downloader import Downloader
from services.reader import read_json
from services.timer import Timer
from services.writer import Writer
from services.web_parser import extract_page

# Set OVERWRITE to True to ignore the local cache and fetch everything again.
OVERWRITE = False
ITEMS_PATH = "./data/api/items.json"
CODES_PATH = "./data/api/codes.json"

downloader = Downloader()
amount_online = downloader.get_amount()
to_fetch = amount_online
print(f"Found {amount_online} documents on regeringen.se")

# Only fetch new items when not overwriting and a local cache already exists.
just_fetch_new = not OVERWRITE and os.path.exists(ITEMS_PATH)
timer = Timer()

if just_fetch_new:
    codes = read_json(CODES_PATH)
    items = read_json(ITEMS_PATH)
    items.reverse()
    stats = read_json("./data/api/latest_updated.json")
    timer.set_latest_update(stats["latest_updated"])
    amount_saved = stats["items"]
    print(f"Found {amount_saved} existing items.")
    print(f"Found {len(codes)} existing codes.")
    # Estimate how many of the latest items to fetch: the difference between
    # what is online and what is saved, plus a margin that grows with the
    # number of days since the last update.
    delta = timer.get_delta()
    to_fetch = abs(amount_online - amount_saved) + 10 + 5 * (delta - 1)
    print(f"Fetching the latest {to_fetch} items...")

new_items, new_codes = downloader.get_latest_items(to_fetch)

if just_fetch_new:
    # Keep only items updated since the day before the last run.
    new_items = [
        i for i in new_items if Downloader.last_updated(i) > timer.day_before()
    ]
    new_items.reverse()

    # Drop cached copies of items that were fetched again, then append the
    # fresh versions.
    new_urls = [item["url"] for item in new_items]
    to_remove = []
    for i, item in enumerate(items):
        if item["url"] in new_urls:
            to_remove.append(i)
    for i in sorted(to_remove, reverse=True):
        items.pop(i)
    items.extend(new_items)
    items.reverse()
    codes.update(new_codes)
else:
    items, codes = new_items, new_codes

# Write an intermediate snapshot before fetching the individual pages.
Writer.write_json(items, ITEMS_PATH)
Writer.write_json(codes, CODES_PATH)

for item in items:
    url = item["url"]
    # Skip items that already have attachments, and one known broken page.
    if "attachments" in item or "201314184" in url:
        continue

    print(f"Fetching page at {url}...")
    page = downloader.get_webpage(url)
    if not page:
        print(f"Error: {url}")
        continue

    md_content, metadata = extract_page(page)
    if not md_content:
        print(f"Error: {url}")
        continue

    # Save the page as Markdown and merge its metadata into the item.
    Writer.write_md(md_content, "data/" + item["url"].strip("/") + ".md")
    for category in metadata["categories"]:
        codes[category[0]] = category[1]
    metadata["categories"] = [category[0] for category in metadata["categories"]]
    item.update(metadata)

# Sort the category codes by key before writing them out.
codes = {str(key): codes[key] for key in sorted(codes)}

latest_updated = {
    "latest_updated": timer.start_string(),
    "items": len(items),
    "codes": len(codes),
}

Writer.write_json(items, ITEMS_PATH)
Writer.write_json(codes, CODES_PATH)
Writer.write_json(latest_updated, "./data/api/latest_updated.json")

# Split the items into one JSON file per document type, based on the URL.
types = read_json("./types.json")


def get(doc_type, items):
    return [item for item in items if f"/{doc_type}/" in item["url"]]


for doc_type in types:
    Writer.write_json(get(doc_type, items), f"./data/{doc_type}.json")