"""
This is an example web scraper for immoscout24.ch.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import json
from pathlib import Path
from typing import Dict, List

from loguru import logger as log
from scrapfly import ScrapeConfig, ScrapflyClient, ScrapeApiResponse

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
    # bypass web scraping blocking
    "asp": True,
    # set the proxy country to Switzerland
    "country": "CH",
}

output = Path(__file__).parent / "results"
output.mkdir(exist_ok=True)


def parse_next_data(response: ScrapeApiResponse) -> Dict:
    """parse listing data from the __INITIAL_STATE__ script tag"""
    selector = response.selector
    # extract the JSON data embedded in the script tag
    script = selector.xpath("//script[contains(text(),'INITIAL_STATE')]/text()").get()
    if not script:
        return None
    # drop the JS assignment prefix to leave bare JSON
    next_data = script.strip().removeprefix("window.__INITIAL_STATE__=")
    # replace JS undefined values with JSON null
    next_data = next_data.replace("undefined", "null")
    next_data_json = json.loads(next_data)
    return next_data_json


async def scrape_properties(urls: List[str]) -> List[Dict]:
    """scrape listing data from immoscout24 property pages"""
    # add the property pages to a scraping list
    to_scrape = [ScrapeConfig(url, **BASE_CONFIG) for url in urls]
    properties = []
    # scrape all property pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(to_scrape):
        data = parse_next_data(response)
        # handle expired property pages, which have no listing data
        try:
            properties.append(data["listing"]["listing"])
        except (TypeError, KeyError):
            log.info("expired property page")
    log.info(f"scraped {len(properties)} property listings")
    return properties


async def scrape_search(
    url: str, scrape_all_pages: bool, max_scrape_pages: int = 10
) -> List[Dict]:
    """scrape listing data from immoscout24 search pages"""
    # scrape the first search page
    first_page = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
    log.info("scraping search page {}", url)
    data = parse_next_data(first_page)["resultList"]["search"]["fullSearch"]["result"]
    search_data = data["listings"]
    # get the maximum number of search pages available
    max_search_pages = data["resultCount"]
    # scrape all available pages if scrape_all_pages is True or max_scrape_pages exceeds what is available
    if not scrape_all_pages and max_scrape_pages < max_search_pages:
        total_pages = max_scrape_pages
    else:
        total_pages = max_search_pages
    log.info("scraping search {} pagination ({} more pages)", url, total_pages - 1)
    # add the remaining search pages to a scraping list
    other_pages = [
        ScrapeConfig(first_page.context["url"] + f"?pn={page}", **BASE_CONFIG)
        for page in range(2, total_pages + 1)
    ]
    # scrape the remaining search pages concurrently
    async for response in SCRAPFLY.concurrent_scrape(other_pages):
        data = parse_next_data(response)
        search_data.extend(
            data["resultList"]["search"]["fullSearch"]["result"]["listings"]
        )
    log.info("scraped {} properties from {}", len(search_data), url)
    return search_data
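

# Example run block: a minimal sketch of how these scrapers might be driven.
# The listing and search URLs below are illustrative placeholders (not from the
# original repo); swap in real immoscout24.ch URLs before running.
import asyncio


async def run():
    # scrape a property page and cache the results to ./results/
    properties = await scrape_properties(
        urls=[
            # hypothetical property page URL, for illustration only
            "https://www.immoscout24.ch/en/d/flat-rent-zurich/1234567",
        ]
    )
    output.joinpath("properties.json").write_text(
        json.dumps(properties, indent=2, ensure_ascii=False), encoding="utf-8"
    )
    # scrape the first two pages of a search and cache the results
    search_data = await scrape_search(
        # hypothetical search URL, for illustration only
        url="https://www.immoscout24.ch/en/real-estate/rent/city-zurich",
        scrape_all_pages=False,
        max_scrape_pages=2,
    )
    output.joinpath("search.json").write_text(
        json.dumps(search_data, indent=2, ensure_ascii=False), encoding="utf-8"
    )


if __name__ == "__main__":
    asyncio.run(run())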