Skip to content

Commit

Permalink
feat: add progress tracking for collection metadata retrieval
Browse files: browse the repository at this point in the history
  • Loading branch information
Ovler-Young committed Nov 19, 2024
1 parent f10e324 commit 886cdf7
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 16 deletions.
9 changes: 6 additions & 3 deletions src/ia_collection_analyzer/libs.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def is_cache_valid(filename, ttl: float | int) -> bool:
return (time.time() - file_mtime) < ttl


def get_collection(collection_id) -> list:
def get_collection(collection_id, progress_hook=None) -> list:
cache_key = f"collection_{collection_id}"
cache_filename = get_cache_filename(cache_key)

Expand All @@ -61,6 +61,7 @@ def get_collection(collection_id) -> list:
fields=["*"],
)
collection = []
total_items = search.num_found
for result in tqdm(
search, desc=f"Fetching {collection_id}", total=search.num_found
):
Expand All @@ -72,6 +73,8 @@ def get_collection(collection_id) -> list:
metadata = result
with open(item_cache_filename, "w") as cache_file:
json.dump(metadata, cache_file)
if progress_hook:
progress_hook(1, total_items)

with open(cache_filename, "w") as cache_file:
json.dump(collection, cache_file, indent=2)
Expand Down Expand Up @@ -100,8 +103,8 @@ def get_item_metadata(item_id) -> dict:
return metadata


def get_collection_items_metadata(collection_id) -> list[dict]:
metadatas = get_collection(collection_id)
def get_collection_items_metadata(collection_id, progress_hook=None) -> list[dict]:
    """Return the metadata records for every item in a collection.

    Thin wrapper that delegates to ``get_collection``.

    Args:
        collection_id: Identifier of the collection to fetch.
        progress_hook: Optional callable forwarded unchanged to
            ``get_collection``; ``None`` disables progress reporting.

    Returns:
        Whatever ``get_collection`` returns for ``collection_id``.
    """
    return get_collection(collection_id, progress_hook)


Expand Down
39 changes: 26 additions & 13 deletions src/ia_collection_analyzer/streamlit.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import pandas as pd
import numpy as np
from libs import get_collection_items_metadata
import time

st.title("Internet Archive Collection Analyzer")

Expand All @@ -10,7 +11,7 @@
)

# input the collection name
col1, col2 = st.columns([3, 1], vertical_alignment="bottom")
col1, col2 = st.columns([6, 1], vertical_alignment="bottom")
with col1:
collection_id = st.text_input("Enter the collection ID:", "speedydeletionwiki")
with col2:
Expand All @@ -19,25 +20,37 @@
if not conform_button:
st.stop()

# display we're getting the metadata
progress_text = st.text(
f"Getting metadata for collection: {collection_id}, please wait..."

guide_text = st.markdown(
f"Getting metadata for collection: **{collection_id}**:"
)

progress_bar = st.progress(0)
items = get_collection_items_metadata(collection_id)
progress_text.text(
f"Getting metadata for collection: {collection_id}, transforming data..."
current_progress = 0
start_time = time.time()
progress_text = st.markdown("getting count and estimating time...")

def progress_hook(add, total):
    """Advance the shared item counter and redraw the progress widgets.

    Args:
        add: Number of newly processed items to add to the running count.
        total: Expected total number of items.

    Side effects:
        Increments the module-level ``current_progress`` and updates
        ``progress_bar`` and ``progress_text``.
    """
    global current_progress
    current_progress += add

    # Sample the clock once so elapsed time and ETA are mutually consistent.
    elapsed = time.time() - start_time

    # Guard against a zero total and clamp to [0.0, 1.0]: the progress bar
    # rejects floats outside that range, which can happen when more results
    # are iterated than the reported total.
    progress = current_progress / total if total > 0 else 0.0
    progress = max(0.0, min(progress, 1.0))
    progress_bar.progress(progress)

    # An ETA cannot be extrapolated until some progress has been made.
    if progress > 0:
        eta = f"{(elapsed / progress) * (1 - progress):.2f}s"
    else:
        eta = "unknown"

    progress_text.markdown(
        f"`{current_progress}/{total}` items processed, "
        f"`{progress*100:.2f}%` done, "
        f"elapsed time: `{elapsed:.2f}s`, ETA: `{eta}`"
    )

items = get_collection_items_metadata(collection_id, progress_hook)

progress_bar.progress(100)

data_transform_text = st.text(
"transforming data..."
)
progress_bar.progress(95)
items_pd = pd.DataFrame(items)
progress_text.text(
f"Getting metadata for collection: {collection_id}, cleaning data..."
data_transform_text.text(
"cleaning data..."
)
# drop columns with 80%+ nan
items_pd = items_pd.dropna(axis=1, thresh=0.8 * len(items_pd))
progress_text.text(f"Getting metadata for collection: {collection_id}, done!")

progress_bar.progress(100)
data_transform_text.text("Data transformation and cleaning complete!")

st.write("The collection contains the following items:")
st.write(items_pd)

0 comments on commit 886cdf7

Please sign in to comment.