Skip to content

Commit

Permalink
Merge pull request #23 from scrapfly/clob-blob
Browse files — browse the repository at this point in the history
support clob and blob object handling
  • Loading branch information
mazen-r authored Aug 13, 2024
2 parents df39084 + e72ae15 commit 550996d
Showing 1 changed file with 35 additions and 0 deletions.
35 changes: 35 additions & 0 deletions scrapfly/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,37 @@ def sink(self, api_response:ScrapeApiResponse, content:Optional[Union[str, bytes
logger.info('file %s created' % file_path)
return file_path

def _handle_scrape_large_objects(
    self,
    body: Dict,
    format: Literal['clob', 'blob']
) -> Dict:
    """Fetch a large-object scrape result and inline it into ``body``.

    On entry ``body['result']['content']`` is used as the URL of the real
    payload. The payload is downloaded with a GET request authenticated
    through the ``key`` query parameter, then written back over that URL.

    Args:
        body: Decoded API response holding a ``result`` mapping with a
            ``content`` entry. It is mutated in place.
        format: Large-object format reported by the API. ``'clob'`` is
            rewritten to ``'text'`` and ``'blob'`` to ``'binary'``; any
            other value leaves ``body['result']['format']`` untouched.

    Returns:
        The same ``body`` object, with ``result.content`` replaced by the
        downloaded payload and ``result.format`` rewritten.

    NOTE(review): when the body handler does not support the response, the
    payload is decoded as UTF-8 for both formats, including ``'blob'``.
    Confirm the large-object endpoint only returns UTF-8-safe data.
    """
    result = body['result']

    # The same negotiation headers as a regular API call are sent so the
    # body handler can decode whatever encoding the server picks.
    download = self._http_handler(
        method='GET',
        url=result['content'],
        verify=self.verify,
        timeout=(self.connect_timeout, self.read_timeout),
        headers={
            'accept-encoding': self.body_handler.content_encoding,
            'accept': self.body_handler.accept,
            'user-agent': self.ua,
        },
        params={'key': self.key},
    )

    if not self.body_handler.support(headers=download.headers):
        payload = download.content.decode('utf-8')
    else:
        payload = self.body_handler(
            content=download.content,
            content_type=download.headers['content-type'],
        )

    result['content'] = payload

    # Translate the large-object marker into the regular format name.
    inlined_format = {'clob': 'text', 'blob': 'binary'}.get(format)
    if inlined_format is not None:
        result['format'] = inlined_format

    return body

def _handle_api_response(
self,
response: Response,
Expand All @@ -748,6 +779,10 @@ def _handle_api_response(
else:
body = response.content.decode('utf-8')

content_format = body['result']['format']
if content_format in ['clob', 'blob']:
body = self._handle_scrape_large_objects(body=body, format=content_format)

api_response:ScrapeApiResponse = ScrapeApiResponse(
response=response,
request=response.request,
Expand Down

0 comments on commit 550996d

Please sign in to comment.