From 5c08c9679c91b70a6802ca3570691e1a07003294 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 4 Apr 2024 15:55:44 -0700 Subject: [PATCH] fix issue with incorrect number of total pages if any of the seeds is a redirect (#1649) Following changes in webrecorder/browsertrix-crawler#475 and webrecorder/browsertrix-crawler#509, the crawler adds a redirected seed to the seen list. To account for this, the number of extra seeds needs to be subtracted from the size of the seen list to get the total page count. --- backend/btrixcloud/operator/crawls.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/backend/btrixcloud/operator/crawls.py b/backend/btrixcloud/operator/crawls.py index 98fdae105a..5a2e19e4a1 100644 --- a/backend/btrixcloud/operator/crawls.py +++ b/backend/btrixcloud/operator/crawls.py @@ -1178,6 +1178,11 @@ async def get_redis_crawl_stats( pages_done = await redis.llen(f"{crawl_id}:d") pages_found = await redis.scard(f"{crawl_id}:s") + # account for extra seeds and subtract from seen list + extra_seeds = await redis.llen(f"{crawl_id}:extraSeeds") + if extra_seeds: + pages_found -= extra_seeds + sizes = await redis.hgetall(f"{crawl_id}:size") archive_size = sum(int(x) for x in sizes.values())