
Commit

remove cache from timeline
datawhores committed Dec 19, 2023
1 parent e6176df commit f4d316a
Showing 4 changed files with 30 additions and 95 deletions.
106 changes: 12 additions & 94 deletions ofscraper/api/timeline.py
@@ -13,7 +13,6 @@
 from concurrent.futures import ThreadPoolExecutor

 import arrow
-from diskcache import Cache
 from rich.console import Group
 from rich.live import Live
 from rich.panel import Panel

@@ -25,13 +24,10 @@
 import ofscraper.constants as constants
 import ofscraper.db.operations as operations
 import ofscraper.utils.args as args_
-import ofscraper.utils.config as config_
 import ofscraper.utils.console as console
 from ofscraper.classes.semaphoreDelayed import semaphoreDelayed
 from ofscraper.utils.run_async import run

-from ..utils.paths import getcachepath
-
 log = logging.getLogger("shared")
 attempt = contextvars.ContextVar("attempt")

@@ -167,21 +163,12 @@ async def get_timeline_media(model_id, username, after=None):
     min_posts = 50
     responseArray = []
     page_count = 0
-    setCache = None
-
-    cache = Cache(
-        getcachepath(), disk=config_.get_cache_mode(config_.read_config())
-    )
-    if not args_.getargs().no_cache:
-        setCache = (
-            True
-            if (args_.getargs().after == 0 or not args_.getargs().after)
-            else False
-        )
-        oldtimeline = cache.get(f"timeline_{model_id}", default=[])
-    else:
-        oldtimeline = []
-        setCache = False
+    oldtimeline = operations.get_timeline_postdates(
+        model_id=model_id, username=username
+    )
     log.trace(
         "oldtimeline {posts}".format(
             posts="\n\n".join(

@@ -190,11 +177,9 @@
         )
     )
     log.debug(f"[bold]Timeline Cache[/bold] {len(oldtimeline)} found")
-    oldtimeline = list(
-        filter(lambda x: x.get("postedAtPrecise") != None, oldtimeline)
-    )
+    oldtimeline = list(filter(lambda x: x != None, oldtimeline))
     postedAtArray = sorted(
-        list(map(lambda x: float(x["postedAtPrecise"]), oldtimeline))
+        list(map(lambda x: arrow.get(x).float_timestamp, oldtimeline))
     )
     after = after or get_after(model_id, username)

@@ -314,68 +299,6 @@
         )
     )
     log.debug(f"[bold]Timeline Count without Dupes[/bold] {len(unduped)} found")
-    if setCache:
-        newcache = {}
-        for post in oldtimeline + list(unduped.values()):
-            id = post["id"]
-            if newcache.get(id):
-                continue
-            newcache[id] = {
-                "id": post.get("id"),
-                "postedAtPrecise": post.get("postedAtPrecise"),
-            }
-        cache.set(
-            f"timeline_{model_id}",
-            list(newcache.values()),
-            expire=constants.RESPONSE_EXPIRY,
-        )
-        newCheck = {}
-        for post in cache.get(f"timeline_check_{model_id}", []) + list(
-            unduped.values()
-        ):
-            newCheck[post["id"]] = post
-        cache.set(
-            f"timeline_check_{model_id}",
-            list(newCheck.values()),
-            expire=constants.DAY_SECONDS,
-        )
-        cache.close()
-    if setCache:
-        lastpost = cache.get(f"timeline_{model_id}_lastpost")
-        post = sorted(newcache.values(), key=lambda x: x.get("postedAtPrecise"))
-        if len(post) > 0:
-            post = post[-1]
-            if not lastpost:
-                cache.set(
-                    f"timeline_{model_id}_lastpost",
-                    (float(post["postedAtPrecise"]), post["id"]),
-                )
-                cache.close()
-            if lastpost and float(post["postedAtPrecise"]) > lastpost[0]:
-                cache.set(
-                    f"timeline_{model_id}_lastpost",
-                    (float(post["postedAtPrecise"]), post["id"]),
-                )
-                cache.close()
-
-    if setCache and after == 0:
-        firstpost = cache.get(f"timeline_{model_id}_firstpost")
-        post = sorted(newcache.values(), key=lambda x: x.get("postedAtPrecise"))
-        if len(post) > 0:
-            post = post[0]
-            if not firstpost:
-                cache.set(
-                    f"timeline_{model_id}_firstpost",
-                    (float(post["postedAtPrecise"]), post["id"]),
-                )
-                cache.close()
-            if firstpost and float(post["postedAtPrecise"]) < firstpost[0]:
-                cache.set(
-                    f"timeline_{model_id}_firstpost",
-                    (float(post["postedAtPrecise"]), post["id"]),
-                )
-                cache.close()

     return list(unduped.values())


@@ -391,25 +314,20 @@ def get_individual_post(id, c=None):


 def get_after(model_id, username):
-    cache = Cache(getcachepath(), disk=config_.get_cache_mode(config_.read_config()))
     if args_.getargs().after:
         return args_.getargs().after.float_timestamp
-    if not cache.get(f"timeline_{model_id}_lastpost") or not cache.get(
-        f"timeline_{model_id}_firstpost"
-    ):
-        log.debug("last date or first date not found in cache")
-        return 0

     curr = operations.get_timeline_media(model_id=model_id, username=username)
     if len(curr) == 0:
         log.debug("Setting date to zero because database is empty")
         return 0

-    elif len(list(filter(lambda x: x[-2] == 0, curr))) == 0:
+    num_missing = len(list(filter(lambda x: x[-2] == 0, curr)))
+    if num_missing == 1:
+        log.debug("Using last db date because,all downloads in db marked as downloaded")
+        return arrow.get(
+            operations.get_last_timeline_date(model_id=model_id, username=username)
+        ).float_timestamp
+    else:
         log.debug(
-            "Using cache for date because,all downloads in db marked as downloaded"
+            f"Setting date to zero because {num_missing} posts in db are marked as undownloaded"
         )
-        return cache.get(f"timeline_{model_id}_lastpost")[0]
-    else:
-        log.debug("Setting date to zero because all other test failed")
         return 0
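
With the diskcache store gone, the resume point for a timeline scan is now derived entirely from the database. Below is a minimal, self-contained sketch of that decision flow, with the db lookups stubbed out as plain Python values: the rows that operations.get_timeline_media returns (where the second-to-last column is the downloaded flag) and the created_at values from the new postDates query. One hedge: the diff itself guards the happy path with num_missing == 1 while its log message talks about every download being marked as done; the sketch uses num_missing == 0 to match the message, so treat the exact guard as an open question rather than a statement of what the commit does. The pick_after name and the stubbed inputs are illustrative only; in the commit this logic lives inside get_after and calls into ofscraper.db.operations.

import arrow


def pick_after(timeline_rows, post_dates, cli_after=None):
    # Sketch of the cache-free get_after decision flow (not the real function).
    # timeline_rows: db rows where row[-2] is the downloaded flag (0 = missing).
    # post_dates:    created_at strings returned by the postDates query.
    # cli_after:     an explicit --after value, which always wins.
    if cli_after is not None:
        return arrow.get(cli_after).float_timestamp
    if len(timeline_rows) == 0:
        return 0  # empty database: scan the whole timeline
    num_missing = len([row for row in timeline_rows if row[-2] == 0])
    if num_missing == 0:
        # Everything already downloaded: resume from the newest stored post date.
        return max(arrow.get(d).float_timestamp for d in post_dates)
    return 0  # some posts still undownloaded: rescan from the beginning


# One row still undownloaded, so the scan restarts from 0.
rows = [("post_a", 1, "id1"), ("post_b", 0, "id2")]
print(pick_after(rows, ["2023-11-02", "2023-12-15"]))
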
2 changes: 1 addition & 1 deletion ofscraper/classes/posts.py
@@ -87,7 +87,7 @@ def id(self):

     @property
     def date(self):
-        return self._post.get("createdAt") or self._post.get("postedAt")
+        return self._post.get("postedAt") or self._post.get("createdAt")

     # modify verison of post date
     @property
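
Assuming the postedAt-first line is the one this commit adds (the rendering above lists the removed line before the added one), the practical effect on the date property is just a changed fallback order, illustrated here with a plain dict standing in for the real _post payload:

post_data = {"createdAt": "2023-01-01", "postedAt": "2023-02-01"}

old_date = post_data.get("createdAt") or post_data.get("postedAt")  # "2023-01-01"
new_date = post_data.get("postedAt") or post_data.get("createdAt")  # "2023-02-01"
print(old_date, new_date)
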
13 changes: 13 additions & 0 deletions ofscraper/db/operations.py
@@ -199,6 +199,14 @@ def write_post_table(posts: list, model_id=None, username=None, conn=None):
         conn.commit()


+@operation_wrapper
+def get_timeline_postdates(model_id=None, username=None, conn=None, **kwargs) -> list:
+    with contextlib.closing(conn.cursor()) as cur:
+        cur.execute(queries.postDates)
+        conn.commit()
+        return list(map(lambda x: x[0], cur.fetchall()))
+
+
 @operation_wrapper
 def create_post_table(model_id=None, username=None, conn=None):
     with contextlib.closing(conn.cursor()) as cur:

@@ -379,6 +387,11 @@ def get_timeline_media(model_id=None, username=None, conn=None) -> list:
         return data


+def get_last_timeline_date(model_id=None, username=None):
+    data = get_timeline_postdates(model_id=model_id, username=username)
+    return sorted(data, key=lambda x: arrow.get(x).float_timestamp)[-1]
+
+
 @operation_wrapper
 def get_archived_media(conn=None, **kwargs) -> list:
     with contextlib.closing(conn.cursor()) as cur:

Expand Down
4 changes: 4 additions & 0 deletions ofscraper/db/queries.py
@@ -212,3 +212,7 @@
 labelID = """
 SELECT id,post_id FROM labels
 """
+
+postDates = """
+SELECT created_at FROM posts
+"""
