Added page request retry method
kaminyv committed Apr 22, 2024
1 parent d93ba81 commit 38899ee
Showing 1 changed file with 93 additions and 66 deletions.
159 changes: 93 additions & 66 deletions auctionscraper/scraper.py
@@ -1,3 +1,5 @@
from typing import Optional

from playwright.sync_api import sync_playwright
from playwright.sync_api import Page
from datetime import date, timedelta
@@ -7,19 +9,26 @@
# Logger
logging.basicConfig(level=logging.DEBUG)

PAGE_DEFAULT_TIMEOUT = 90000
MAX_RETRY = 5



def read_txt(txt: str):
""" Read subdomain (county) from txt file """
with open(txt, 'r') as f:
return [line.strip() for line in f.readlines()]


def create_baseurl(subdomain: str, category: str) -> str:
""" Create calendar URL """
if category not in ['foreclose', 'taxdeed']:
return ('Please define "foreclose" or "taxdeed" in category argument')
else:
return f"https://{subdomain}.real{category}.com/index.cfm?zaction=USER&zmethod=CALENDAR"


def create_calendar_url(baseurl: str, days=0) -> list:
""" Get calendar pages to be scraped """
tday = date.today() + timedelta(days=days)
days_out = 90
@@ -34,17 +43,17 @@ def create_calendar_url(baseurl:str, days=0) -> list:
calendar.append(baseurl + "&selCalDate=" + date_url)
return calendar

def get_calendar_list(category: str, days: int) -> list:
""" Get calendar url list to be scraped """
calendar_url = []
for subdomain in read_txt(f"{category}.txt"):
baseurl = create_baseurl(subdomain, category)
calendar_url += create_calendar_url(baseurl, days=days)
return calendar_url

def parse_box(page: Page) -> list:
""" Parse url from box calendar """
calendar_box = page.query_selector_all('div[class*=CALSEL]') # could be CALSEF, CALSET, CALSELB
box_url = []
for box in calendar_box:
day_id = box.get_attribute('dayid')
@@ -61,86 +70,104 @@ def parse_box(page:Page) -> list:
box_url.append(url)
return box_url

def page_request(page: Page, url: str, selector: str, timeout: int) \
        -> Optional[Page]:
    """ Request url and wait for selector, retrying up to MAX_RETRY times.

    Returns the Page on success, or None when the site reports OFFLINE
    or every retry fails. """
    title_selector = "#Content_Title > h1"
    for retry_number in range(1, MAX_RETRY + 1):
        try:
            page.goto(url)
            title = page.wait_for_selector(title_selector,
                                           timeout=timeout)
            if title.text_content().upper() == 'OFFLINE':
                logging.info('Page response status OFFLINE')
                return None

            page.wait_for_selector(selector, timeout=timeout)
            return page
        except Exception as e:
            logging.info(f'RETRY: {retry_number} | error {e}')
    # every retry failed
    return None

def get_box_list(urls: list) -> list:
""" Get box url from calendar page """
data = []
with sync_playwright() as p:
# open browser
browser = p.firefox.launch()
page = browser.new_page()
page.set_default_timeout(PAGE_DEFAULT_TIMEOUT)
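        # CSS selector for the calendar day grid that page_request waits for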
selector = '.CALDAYBOX'
for url in urls:
# access page
logging.debug(f"GET {url} | LEVEL 1")
response = page_request(page, url, selector, 5000)
if response is None:
logging.warning(f'Failed to GET {url}')
continue

data += parse_box(response)
# close browser
browser.close()
return data

def get_data(urls: list):
""" Get auction data """
data = []
# open browser
with sync_playwright() as p:
browser = p.firefox.launch()
page = browser.new_page()
page.set_default_timeout(PAGE_DEFAULT_TIMEOUT)
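        # CSS selector for the auction cards that page_request waits for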
selector = '#Area_W > .AUCTION_ITEM.PREVIEW'
for url in urls:
# access page
logging.debug(f"GET {url} | LEVEL 2")
response_page = page_request(page, url, selector, 5000)
if response_page is None:
logging.warning(f'Failed to GET {url}')
continue

cards = response_page.query_selector_all('#Area_W > .AUCTION_ITEM.PREVIEW')
for card in cards:
# parse date
auction_date = re.sub(r'^.+AUCTIONDATE=(\d{2}/\d{2}/\d{4})$', '\\1', url)
# parse fields
auction_field = []
for text in card.query_selector_all('tr > th'):
th = text.inner_text().replace('#', '').replace(':', '').strip()
if th == '':
th = 'city'
th = th.lower().replace(' ', '_')
auction_field.append(th)
# parse content
auction_content = [text.inner_text().strip() for text in card.query_selector_all('tr > td')]
if len(auction_field) == len(auction_content):
auction_info = {auction_field[i]: auction_content[i] for i in range(len(auction_field))}
fields = list(auction_info.keys())
for key in fields:
if key == "city":
city = auction_info[key].split(', ')[0].strip()
zipcode = auction_info[key].split(',')[1].strip()
try:
state = zipcode.split('-')[0].strip()
zipcode = zipcode.split('-')[1].strip()
                            except IndexError:
                                # no '-' separator after the comma: assume Florida and keep the raw zip value
                                state = 'FL'
auction_info.update({
'city': city,
'state': state,
'zipcode': zipcode,
'auction_date': auction_date,
})
else:
logging.warning(f"Length of information's fields and contents doesn't matches: {url}")
continue
data.append(auction_info)

# close browser
browser.close()
return data


if __name__ == '__main__':
pass
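
A minimal sketch of how the scraper's levels might be chained from a separate script. The import path, the presence of a foreclose.txt subdomain list in the working directory, and the plain print output are illustrative assumptions, not part of this commit:

from auctionscraper.scraper import get_calendar_list, get_box_list, get_data

if __name__ == '__main__':
    # build calendar URLs for every subdomain listed in foreclose.txt
    calendar_urls = get_calendar_list('foreclose', days=0)
    # LEVEL 1: open each calendar page and collect the per-day auction URLs
    box_urls = get_box_list(calendar_urls)
    # LEVEL 2: open each auction page and parse the auction cards
    for auction in get_data(box_urls):
        print(auction)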
