-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraping_headers.py
55 lines (37 loc) · 1.49 KB
/
scraping_headers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import requests
from bs4 import BeautifulSoup
# Page to scrape.
url = "https://codeavecjonathan.com/scraping/recette_ua/"

# HTTP headers sent with the request: a desktop Firefox (Linux) User-Agent
# string is used in place of the HTTP client's default one.
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:120.0) Gecko/20100101 Firefox/120.0",
}
def get_text_if_not_none(e):
    """Return the stripped text of an HTML element, or None if there is none.

    Args:
        e: An object exposing a ``text`` attribute (the script passes the
            results of BeautifulSoup lookups), or None when the lookup
            found nothing.

    Returns:
        ``e.text`` with leading and trailing whitespace removed, or None
        when ``e`` is None.
    """
    # Compare against None explicitly rather than relying on truthiness, so
    # an element that is present but evaluates as falsy still yields its text.
    if e is not None:
        return e.text.strip()
    return None
# Fetch the page. The timeout (seconds) keeps the script from blocking
# forever if the server never answers; requests applies no timeout by default.
response = requests.get(url, headers=headers, timeout=10)
# Decode the body with the charset detected from its content.
response.encoding = response.apparent_encoding

if response.status_code == 200:
    print(f"OK - RESPONSE CODE = {response.status_code}")
    html = response.text

    # Save the source into an html file. The context manager closes the file
    # even if the write fails, and the explicit encoding avoids depending on
    # the platform's locale default for non-ASCII characters.
    with open("source.html", "w", encoding="utf-8") as src_file:
        src_file.write(html)

    # Parsing and extracting the data
    soup = BeautifulSoup(html, "html5lib")

    # find() returns None when nothing matches, so guard before reading .text.
    h1 = soup.find("h1")
    title = h1.text if h1 is not None else None
    print(f"Recipe: {title}")

    description = get_text_if_not_none(soup.find("p", class_="description"))
    print(f"Description: \n{description}")

    # A missing container yields an empty list instead of an AttributeError.
    div_ingredients = soup.find("div", class_="ingredients")
    ingredients = div_ingredients.find_all("p") if div_ingredients is not None else []
    print("Ingredients:")
    for ingredient in ingredients:
        print(f"- {get_text_if_not_none(ingredient)}")

    preparation_table = soup.find("table", class_="preparation")
    steps = (
        preparation_table.find_all("td", class_="preparation_etape")
        if preparation_table is not None
        else []
    )
    print("Preparation:")
    for step in steps:
        print(f"- {get_text_if_not_none(step)}")
else:
    print(f"ERROR - RESPONSE CODE = {response.status_code}")