Added page request retry method
kaminyv committed Apr 22, 2024
1 parent d93ba81 commit 38899ee
Showing 1 changed file with 93 additions and 66 deletions.
159 changes: 93 additions & 66 deletions auctionscraper/scraper.py
@@ -1,3 +1,5 @@
from typing import Optional

from playwright.sync_api import sync_playwright
from playwright.sync_api import Page
from datetime import date, timedelta
@@ -7,19 +9,26 @@
# Logger
logging.basicConfig(level=logging.DEBUG)

PAGE_DEFAULT_TIMEOUT = 90000
MAX_RETRY = 5



def read_txt(txt: str):
""" Read subdomain (county) from txt file """
with open(txt, 'r') as f:
return [line.strip() for line in f.readlines()]


def create_baseurl(subdomain: str, category: str) -> str:
""" Create calendar URL """
if category not in ['foreclose', 'taxdeed']:
return ('Please define "foreclose" or "taxdeed" in category argument')
else:
return f"https://{subdomain}.real{category}.com/index.cfm?zaction=USER&zmethod=CALENDAR"


def create_calendar_url(baseurl: str, days=0) -> list:
""" Get calendar pages to be scraped """
tday = date.today() + timedelta(days=days)
days_out = 90
@@ -34,17 +43,17 @@ def create_calendar_url(baseurl:str, days=0) -> list:
calendar.append(baseurl + "&selCalDate=" + date_url)
return calendar

def get_calendar_list(category: str, days: int) -> list:
""" Get calendar url list to be scraped """
calendar_url = []
for subdomain in read_txt(f"{category}.txt"):
baseurl = create_baseurl(subdomain, category)
calendar_url += create_calendar_url(baseurl, days=days)
return calendar_url

def parse_box(page: Page) -> list:
""" Parse url from box calendar """
calendar_box = page.query_selector_all('div[class*=CALSEL]') # could be CALSEF, CALSET, CALSELB
box_url = []
for box in calendar_box:
day_id = box.get_attribute('dayid')
@@ -61,86 +70,104 @@ def parse_box(page:Page) -> list:
box_url.append(url)
return box_url

def page_request(page: Page, url: str, selector: str, timeout: int) \
        -> Optional[Page]:
    """ Request url and wait for selector, retrying up to MAX_RETRY times.

    Returns the Page on success, or None when the site reports OFFLINE
    or every retry fails. """
    title_selector = "#Content_Title > h1"
    for retry_number in range(1, MAX_RETRY + 1):
        try:
            page.goto(url)
            title = page.wait_for_selector(title_selector,
                                           timeout=timeout)
            if title.text_content().upper() == 'OFFLINE':
                logging.info('Page response status OFFLINE')
                return None

            page.wait_for_selector(selector, timeout=timeout)
            return page
        except Exception as e:
            logging.info(f'RETRY: {retry_number} | error {e}')
    # every retry failed
    return None

def get_box_list(urls: list) -> list:
""" Get box url from calendar page """
data = []
with sync_playwright() as p:
# open browser
browser = p.firefox.launch()
page = browser.new_page()
page.set_default_timeout(PAGE_DEFAULT_TIMEOUT)
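        # CSS selector for the calendar day grid that page_request waits for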
selector = '.CALDAYBOX'
for url in urls:
# access page
logging.debug(f"GET {url} | LEVEL 1")
response = page_request(page, url, selector, 5000)
if response is None:
logging.warning(f'Failed to GET {url}')
continue

data += parse_box(response)
# close browser
browser.close()
return data

def get_data(urls: list):
""" Get auction data """
data = []
# open browser
with sync_playwright() as p:
browser = p.firefox.launch()
page = browser.new_page()
page.set_default_timeout(PAGE_DEFAULT_TIMEOUT)
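        # CSS selector for the auction cards that page_request waits for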
selector = '#Area_W > .AUCTION_ITEM.PREVIEW'
for url in urls:
# access page
logging.debug(f"GET {url} | LEVEL 2")
response_page = page_request(page, url, selector, 5000)
if response_page is None:
logging.warning(f'Failed to GET {url}')
continue

cards = response_page.query_selector_all('#Area_W > .AUCTION_ITEM.PREVIEW')
for card in cards:
# parse date
auction_date = re.sub(r'^.+AUCTIONDATE=(\d{2}/\d{2}/\d{4})$', '\\1', url)
# parse fields
auction_field = []
for text in card.query_selector_all('tr > th'):
th = text.inner_text().replace('#', '').replace(':', '').strip()
if th == '':
th = 'city'
th = th.lower().replace(' ', '_')
auction_field.append(th)
# parse content
auction_content = [text.inner_text().strip() for text in card.query_selector_all('tr > td')]
if len(auction_field) == len(auction_content):
auction_info = {auction_field[i]: auction_content[i] for i in range(len(auction_field))}
fields = list(auction_info.keys())
for key in fields:
if key == "city":
city = auction_info[key].split(', ')[0].strip()
zipcode = auction_info[key].split(',')[1].strip()
try:
state = zipcode.split('-')[0].strip()
zipcode = zipcode.split('-')[1].strip()
                            except IndexError:
                                # no '-' separator after the comma: assume Florida and keep the raw zip value
                                state = 'FL'
auction_info.update({
'city': city,
'state': state,
'zipcode': zipcode,
'auction_date': auction_date,
})
else:
logging.warning(f"Length of information's fields and contents doesn't matches: {url}")
continue
data.append(auction_info)

# close browser
browser.close()
return data


if __name__ == '__main__':
pass
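
A minimal sketch of how the scraper's levels might be chained from a separate script. The import path, the presence of a foreclose.txt subdomain list in the working directory, and the plain print output are illustrative assumptions, not part of this commit:

from auctionscraper.scraper import get_calendar_list, get_box_list, get_data

if __name__ == '__main__':
    # build calendar URLs for every subdomain listed in foreclose.txt
    calendar_urls = get_calendar_list('foreclose', days=0)
    # LEVEL 1: open each calendar page and collect the per-day auction URLs
    box_urls = get_box_list(calendar_urls)
    # LEVEL 2: open each auction page and parse the auction cards
    for auction in get_data(box_urls):
        print(auction)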
