bing.py
"""
This is an example web scraper for bing.com.
To run this scraper set env variable $SCRAPFLY_KEY with your scrapfly API key:
$ export SCRAPFLY_KEY="your key from https://scrapfly.io/dashboard"
"""
import os
import re
from typing import Dict, List
from urllib.parse import urlencode, urlparse

from loguru import logger as log
from scrapfly import ScrapeApiResponse, ScrapeConfig, ScrapflyClient

SCRAPFLY = ScrapflyClient(key=os.environ["SCRAPFLY_KEY"])
BASE_CONFIG = {
# bypass Bing web scraping blocking
"asp": True,
    # set the proxy location to US to get results in English
"country": "US",
}


def parse_serps(response: ScrapeApiResponse) -> List[Dict]:
"""parse SERPs from bing search pages"""
selector = response.selector
data = []
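    # Bing paginates with a "first" offset parameter; seed the position
    # counter from it so result ranks stay global across pages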
if "first" not in response.context["url"]:
position = 0
else:
position = int(response.context["url"].split("first=")[-1])
for result in selector.xpath("//li[@class='b_algo']"):
url = result.xpath(".//h2/a/@href").get()
        description = result.xpath("normalize-space(.//div/p)").get()
date = result.xpath(".//span[@class='news_dt']/text()").get()
        if date is not None and len(date) > 12:
            # long date strings embed extra text; keep only a dd-mm-yyyy match
            date_pattern = re.compile(r"\b\d{2}-\d{2}-\d{4}\b")
            dates = date_pattern.findall(date)
            date = dates[0] if dates else None
position += 1
data.append(
{
"position": position,
"title": "".join(result.xpath(".//h2/a//text()").extract()),
"url": url,
"origin": result.xpath(".//div[@class='tptt']/text()").get(),
"domain": url.split("https://")[-1].split("/")[0].replace("www.", "")
if url
else None,
"description": description,
"date": date,
}
)
return data


def parse_keywords(response: ScrapeApiResponse) -> Dict:
"""parse FAQs and popular keywords on bing search pages"""
selector = response.selector
faqs = []
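    # QnA ("people also ask" style) cards from the results sidebar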
for faq in selector.xpath("//div[@class='b_slidebar']/div/div[contains(@data-tag, 'QnA')]"):
url = faq.xpath(".//h2/a/@href").get()
faqs.append(
{
"query": faq.xpath("./@data-query").get(),
"answer": faq.xpath(".//span[contains(@data-tag, 'QnA')]/text()").get(),
"title": "".join(faq.xpath(".//div[@class='b_algo']/h2/a//text()").extract()),
"domain": url.split("https://")[-1].split("/")[0].replace("www.", "")if url else None,
"url": url,
}
)
related_keywords = []
for keyword in selector.xpath(".//li[@class='b_ans']/div/ul/li"):
related_keywords.append("".join(keyword.xpath(".//a/div//text()").extract()))
return {"FAQs": faqs, "related_keywords": related_keywords}


def parse_rich_snippet(response: ScrapeApiResponse) -> Dict:
"""parse rich snippets from Bing search"""
selector = response.selector
data = {}
data["title"] = selector.xpath("//div[@class='l_ecrd_hero_ttl']/div/a/h2/span/text()").get()
data["link"] = selector.xpath("//div[@class='l_ecrd_hero_ttl']/div/a/@href").get()
data["heading"] = " ".join(selector.xpath("//a[@title]/h2/span/text()").getall())
data["links"] = {}
for item in selector.xpath("//div[contains(@class, 'webicons')]/div"):
name = item.xpath(".//a/@title").get()
link = item.xpath(".//a/@href").get()
data["links"][name] = link
data["info"] = {}
for row in selector.xpath("//div[contains(@class, 'expansion')]/div[contains(@class, 'row')]"):
        key = (row.xpath(".//div/div/a[1]/text()").get() or "").strip()
        if not key:
            continue
        value = row.xpath("string(.//div[not(contains(@class, 'title'))])").get(default="").strip().replace(key, "")
data["info"][key] = value
all_text = ""
for div_element in selector.xpath("//div[@class='lite-entcard-blk l_ecrd_bkg_hlt']"):
div_text = div_element.xpath("string(.)").get().strip()
all_text += div_text + "\n"
data["descrption"] = all_text
return data


async def scrape_search(query: str, max_pages: int = 1):
    """scrape bing search pages"""
url = f"https://www.bing.com/search?{urlencode({'q': query})}"
log.info("scraping the first search page")
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG))
serp_data = parse_serps(response)
log.info(f"scraping search pagination ({max_pages - 1} more pages)")
total_results = (max_pages - 1) * 10 # each page contains 10 results
other_pages = [
ScrapeConfig(url + f"&first={start}", **BASE_CONFIG)
for start in range(10, total_results + 10, 10)
]
# scrape the remaining search pages concurrently
async for response in SCRAPFLY.concurrent_scrape(other_pages):
data = parse_serps(response)
serp_data.extend(data)
log.success(f"scraped {len(serp_data)} search results from Bing search")
return serp_data


async def scrape_keywords(query: str):
"""scrape bing search pages for keyword data"""
url = f"https://www.bing.com/search?{urlencode({'q': query})}"
log.info("scraping Bing search for keyword data")
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, **BASE_CONFIG, render_js=True))
keyword_data = parse_keywords(response)
log.success(
f"scraped {len(keyword_data['related_keywords'])} keywords and {len(keyword_data['FAQs'])} FAQs from Bing search"
)
return keyword_data


async def scrape_rich_snippets(query: str):
"""scrape bing search pages for rich snippets data"""
url = f"https://www.bing.com/search?{urlencode({'q': query})}"
    log.info("scraping Bing search for rich snippet data")
response = await SCRAPFLY.async_scrape(ScrapeConfig(url, asp=True, country="GB", render_js=True))
rich_snippet_data = parse_rich_snippet(response)
log.success(f"scraped {len(rich_snippet_data)} rich snippets fields from Bing search")
return rich_snippet_data
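

if __name__ == "__main__":
    # minimal usage sketch (illustrative; the query strings below are arbitrary
    # placeholders): run the three scrapers and print a summary of each result.
    # assumes $SCRAPFLY_KEY is set as described in the module docstring
    import asyncio

    async def run():
        serps = await scrape_search("web scraping", max_pages=2)
        print(f"got {len(serps)} SERP results")
        keywords = await scrape_keywords("web scraping")
        print(f"got {len(keywords['related_keywords'])} related keywords")
        rich_snippets = await scrape_rich_snippets("scrapfly")
        print(f"got {len(rich_snippets)} rich snippet fields")

    asyncio.run(run())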