
Commit

add scrape more users
datawhores committed Nov 30, 2023
1 parent 8847f68 commit f6fa346
Showing 2 changed files with 38 additions and 11 deletions.
47 changes: 37 additions & 10 deletions ofscraper/api/subscriptions.py
@@ -25,24 +25,51 @@


 async def get_subscriptions(headers, subscribe_count):
-    offsets = range(0, subscribe_count, 10)
-    tasks = [scrape_subscriptions(headers, offset) for offset in offsets]
-    subscriptions = await asyncio.gather(*tasks)
-    return list(chain.from_iterable(subscriptions))
+    global tasks
+    global new_tasks
+    tasks = [asyncio.create_task(scrape_subscriptions(headers, offset)) for offset in range(0, subscribe_count + 1, 10)]
+    tasks.extend([asyncio.create_task(scrape_subscriptions(headers, subscribe_count + 1, recurs=True))])
+    new_tasks = []
+    out = []
+    while tasks:
+        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
+        for result in done:
+            try:
+                result = await result
+            except Exception as E:
+                log.debug(E)
+                continue
+            out.extend(result)
+        tasks = list(pending)
+        tasks.extend(new_tasks)
+        new_tasks = []
+    return out





 @retry(stop=stop_after_attempt(NUM_TRIES), wait=wait_random(min=constants.OF_MIN, max=constants.OF_MAX), reraise=True)
-async def scrape_subscriptions(headers, offset=0) -> list:
+async def scrape_subscriptions(headers, offset=0, recurs=False) -> list:
     async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=constants.API_REEQUEST_TIMEOUT, connect=None, sock_connect=None, sock_read=None)) as c:
         url = subscriptionsEP.format(offset)
         headers = auth.make_headers(auth.read_auth())
         headers = auth.create_sign(url, headers)
-        async with c.request("get", url, ssl=ssl.create_default_context(cafile=certifi.where()), cookies=auth.add_cookies_aio(), headers=headers) as r:
-            if r.ok:
-                subscriptions = await r.json()
-                log.debug(f"usernames offset {offset}: usernames retrieved -> {list(map(lambda x: x.get('username'), subscriptions))}")
-                return subscriptions
+        try:
+            async with c.request("get", url, ssl=ssl.create_default_context(cafile=certifi.where()), cookies=auth.add_cookies_aio(), headers=headers) as r:
+                if r.ok:
+                    data = await r.json()
+                    if len(data) == 0:
+                        pass
+                    elif recurs:
+                        new_tasks.append(asyncio.create_task(scrape_subscriptions(headers, recurs=True, offset=offset + len(data))))
+                    log.debug(f"usernames offset {offset}: usernames retrieved -> {list(map(lambda x: x.get('username'), data))}")
+                    return data
+                r.raise_for_status()
+        except Exception as E:
+            log.debug(E)
+            raise E

 def parse_subscriptions(subscriptions: list) -> list:
     datenow = arrow.now()
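The change replaces a fixed asyncio.gather over precomputed offsets with a task pool that can grow while it drains: an initial batch of offset tasks plus one probe past the reported subscription count, awaited with asyncio.wait(return_when=asyncio.FIRST_COMPLETED); when the probe finds a full page, it appends a follow-up page task to a shared new_tasks list, which the loop folds back into the pool. The sketch below is a minimal, self-contained illustration of that pattern, not the project's code; fetch_page, REPORTED, ACTUAL, and PAGE_SIZE are hypothetical stand-ins for the signed scrape_subscriptions request, the API's reported subscribe_count, the real number of subscriptions, and the page size of 10.

import asyncio

PAGE_SIZE = 10
REPORTED = 29   # what the count endpoint claims (stand-in for subscribe_count)
ACTUAL = 47     # what paging actually returns

new_tasks = []  # completed probe tasks park follow-up page tasks here


async def fetch_page(offset, recurs=False):
    # Stand-in for scrape_subscriptions(): the real code makes a signed aiohttp
    # request to subscriptionsEP.format(offset) and returns the JSON list.
    await asyncio.sleep(0.01)
    data = [f"user_{i}" for i in range(offset, min(offset + PAGE_SIZE, ACTUAL))]
    if recurs and data:
        # Keep paging past the reported count until an empty response comes back.
        new_tasks.append(asyncio.create_task(fetch_page(offset + len(data), recurs=True)))
    return data


async def get_all():
    # Initial batch of pages covering the reported count, plus one probe past it.
    tasks = [asyncio.create_task(fetch_page(offset)) for offset in range(0, REPORTED + 1, PAGE_SIZE)]
    tasks.append(asyncio.create_task(fetch_page(REPORTED + 1, recurs=True)))
    out = []
    while tasks:
        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in done:
            try:
                out.extend(task.result())
            except Exception as e:
                print(e)  # the real code logs the failure and skips that page
        # Fold any follow-up pages scheduled by finished probes back into the pool.
        tasks = list(pending) + new_tasks
        new_tasks.clear()
    return out


print(len(asyncio.run(get_all())))  # 47: every user is found, not just the reported 29

Note that when the probe offset does not land on a page boundary, overlapping pages can return duplicate usernames, so a caller of this pattern would need to deduplicate the combined result.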
2 changes: 1 addition & 1 deletion wiki
Submodule wiki updated 60 files
+ .gitbook/assets/image (1) (1).png
+ .gitbook/assets/image (1) (2).png
+ .gitbook/assets/image (1).png
+ .gitbook/assets/image (10).png
+ .gitbook/assets/image (11).png
+ .gitbook/assets/image (12).png
+ .gitbook/assets/image (2) (1).png
+ .gitbook/assets/image (2).png
+ .gitbook/assets/image (3).png
+ .gitbook/assets/image (4) (1).png
+ .gitbook/assets/image (4).png
+ .gitbook/assets/image (5).png
+ .gitbook/assets/image (6).png
+ .gitbook/assets/image (7).png
+ .gitbook/assets/image (8).png
+ .gitbook/assets/image (9).png
+ .gitbook/assets/image.png
+ .gitbook/assets/image2.png
+ .gitbook/assets/img1 (1).png
+ .gitbook/assets/img1 (2).png
+ .gitbook/assets/img1.png
+0 −1 README
+8 −0 README.md
+32 −0 SUMMARY.md
+110 −0 auth.md
+80 −0 batch-scraping-and-bot-actions/README.md
+94 −0 batch-scraping-and-bot-actions/advanced-args.md
+12 −0 batch-scraping-and-bot-actions/deleted-models.md
+21 −0 batch-scraping-and-bot-actions/liking-post.md
+31 −0 batch-scraping-and-bot-actions/liking-unliking-post.md
+119 −0 batch-scraping-and-bot-actions/model-selection-sorting.md
+58 −0 batch-scraping-and-bot-actions/page-1.md
+108 −0 batch-scraping-and-bot-actions/selecting-posts.md
+149 −0 cdm-options.md
+694 −0 command-reference.md
+201 −0 config-options.md
+186 −0 config-options/README.md
+173 −0 config-options/advanced-config-options.md
+189 −0 config-options/customizing-save-path.md
+30 −0 config-options/key-option.md
+77 −0 config-options/scraping-individual-posts-or-media.md
+51 −0 config-options/setting-metadata-path.md
+11 −0 config-options/testing.md
+99 −0 config-options/username-selection-or-fuzzy-search.md
+313 −0 content-check-modes.md
+106 −0 controlling-output.md
+48 −0 docker.md
+104 −0 faq.md
+26 −0 getting-started.md
+71 −0 installation.md
+156 −0 installation/README.md
+75 −0 installation/pip-install.md
+181 −0 installation/pre-install-guide.md
+44 −0 installation/release-info.md
+80 −0 migrating-from-digitalcriminals-script.md
+7 −0 request-change-to-doc.md
+74 −0 scraping-individual-posts-or-media.md
+11 −0 testing.md
+44 −0 using-prompts/README.md
+93 −0 using-prompts/username-selection-or-fuzzy-search.md
