had to switch over to Pushpull API because Reddit was blocking the DigitalOcean IP

This commit is contained in:
2026-03-10 07:11:20 -05:00
parent ae358ab05c
commit 43f716a58a
3 changed files with 120 additions and 130 deletions

View File

@@ -1,3 +1,4 @@
import asyncio
import logging
from datetime import datetime, timezone, timedelta
@@ -14,7 +15,6 @@ from backend.worker.reddit_client import create_client, fetch_json
logger = logging.getLogger(__name__)
# Sync engine for worker (PRAW-replacement uses async httpx, but DB writes are sync for simplicity with APScheduler)
_engine = create_engine(settings.database_url_sync, pool_size=3, max_overflow=5, pool_recycle=3600)
SyncSession = sessionmaker(_engine)
@@ -42,23 +42,24 @@ def _upsert_author(db: Session, username: str) -> int | None:
def _parse_post(post_data: dict, subreddit_id: int, db: Session, hot_rank: int | None = None) -> dict:
data = post_data.get("data", post_data)
author_id = _upsert_author(db, data.get("author"))
created = datetime.fromtimestamp(data.get("created_utc", 0), tz=timezone.utc)
"""Parse a Pullpush submission object into a dict for DB upsert."""
author_id = _upsert_author(db, post_data.get("author"))
created = datetime.fromtimestamp(post_data.get("created_utc", 0), tz=timezone.utc)
reddit_id = post_data.get("name", f"t3_{post_data.get('id', '')}")
return {
"reddit_id": data.get("name", f"t3_{data.get('id', '')}"),
"reddit_id": reddit_id,
"subreddit_id": subreddit_id,
"author_id": author_id,
"title": data.get("title", ""),
"selftext": data.get("selftext"),
"url": data.get("url"),
"permalink": data.get("permalink"),
"flair": data.get("link_flair_text"),
"score": data.get("score", 0),
"upvote_ratio": data.get("upvote_ratio"),
"num_comments": data.get("num_comments", 0),
"is_self": data.get("is_self"),
"over_18": data.get("over_18", False),
"title": post_data.get("title", ""),
"selftext": post_data.get("selftext"),
"url": post_data.get("url"),
"permalink": post_data.get("permalink"),
"flair": post_data.get("link_flair_text"),
"score": post_data.get("score", 0),
"upvote_ratio": post_data.get("upvote_ratio"),
"num_comments": post_data.get("num_comments", 0),
"is_self": post_data.get("is_self"),
"over_18": post_data.get("over_18", False),
"hot_rank": hot_rank,
"created_utc": created,
"collected_at": datetime.now(timezone.utc),
@@ -87,14 +88,14 @@ def _upsert_posts(db: Session, posts: list[dict], update_hot_rank: bool = False)
def _parse_comment(comment_data: dict, post_id: int, db: Session, parent_map: dict) -> dict | None:
data = comment_data.get("data", comment_data)
if data.get("kind") == "more" or not data.get("body"):
"""Parse a Pullpush comment object into a dict for DB upsert."""
if not comment_data.get("body"):
return None
reddit_id = data.get("name", f"t1_{data.get('id', '')}")
author_id = _upsert_author(db, data.get("author"))
created = datetime.fromtimestamp(data.get("created_utc", 0), tz=timezone.utc)
reddit_id = comment_data.get("name", f"t1_{comment_data.get('id', '')}")
author_id = _upsert_author(db, comment_data.get("author"))
created = datetime.fromtimestamp(comment_data.get("created_utc", 0), tz=timezone.utc)
parent_reddit_id = data.get("parent_id", "")
parent_reddit_id = comment_data.get("parent_id", "")
parent_comment_id = parent_map.get(parent_reddit_id)
return {
@@ -102,19 +103,16 @@ def _parse_comment(comment_data: dict, post_id: int, db: Session, parent_map: di
"post_id": post_id,
"parent_comment_id": parent_comment_id,
"author_id": author_id,
"body": data.get("body", ""),
"score": data.get("score", 0),
"body": comment_data.get("body", ""),
"score": comment_data.get("score", 0),
"created_utc": created,
"collected_at": datetime.now(timezone.utc),
"updated_at": datetime.now(timezone.utc),
}
import asyncio
def poll_new_posts():
"""Fetch /new for each active subreddit and upsert posts."""
"""Fetch recent submissions from Pullpush for each active subreddit."""
asyncio.run(_poll_new_posts_async())
@@ -126,22 +124,27 @@ async def _poll_new_posts_async():
client = create_client()
async with client:
for sub in subreddits:
data = await fetch_json(client, f"/r/{sub['name']}/new", {"limit": "100"})
data = await fetch_json(client, "/reddit/search/submission/", {
"subreddit": sub["name"],
"sort": "created_utc",
"sort_type": "desc",
"size": 100,
})
if not data:
continue
children = data.get("data", {}).get("children", [])
if not children:
posts_data = data.get("data", [])
if not posts_data:
continue
with SyncSession() as db:
posts = [_parse_post(child, sub["id"], db) for child in children]
posts = [_parse_post(p, sub["id"], db) for p in posts_data]
_upsert_posts(db, posts)
db.commit()
logger.info(f"r/{sub['name']}: upserted {len(children)} new posts")
logger.info(f"r/{sub['name']}: upserted {len(posts_data)} new posts")
def poll_hot_posts():
"""Fetch /hot for each active subreddit and update hot_rank."""
"""Approximate hot posts by fetching recent high-scoring submissions."""
asyncio.run(_poll_hot_posts_async())
@@ -150,88 +153,91 @@ async def _poll_hot_posts_async():
if not subreddits:
return
after_epoch = int((datetime.now(timezone.utc) - timedelta(hours=24)).timestamp())
client = create_client()
async with client:
for sub in subreddits:
data = await fetch_json(client, f"/r/{sub['name']}/hot", {"limit": "100"})
data = await fetch_json(client, "/reddit/search/submission/", {
"subreddit": sub["name"],
"sort": "score",
"sort_type": "desc",
"size": 100,
"after": after_epoch,
})
if not data:
continue
children = data.get("data", {}).get("children", [])
if not children:
posts_data = data.get("data", [])
if not posts_data:
continue
with SyncSession() as db:
posts = [
_parse_post(child, sub["id"], db, hot_rank=i + 1)
for i, child in enumerate(children)
_parse_post(p, sub["id"], db, hot_rank=i + 1)
for i, p in enumerate(posts_data)
]
_upsert_posts(db, posts, update_hot_rank=True)
db.commit()
logger.info(f"r/{sub['name']}: updated hot ranks for {len(children)} posts")
logger.info(f"r/{sub['name']}: updated hot ranks for {len(posts_data)} posts")
def collect_comments():
"""Fetch comments for recent posts."""
"""Fetch recent comments from Pullpush for each active subreddit."""
asyncio.run(_collect_comments_async())
async def _collect_comments_async():
cutoff = datetime.now(timezone.utc) - timedelta(hours=48)
with SyncSession() as db:
stmt = (
select(Post.id, Post.reddit_id, Post.subreddit_id)
.join(MonitoredSubreddit)
.where(
MonitoredSubreddit.is_active == True, # noqa: E712
Post.created_utc >= cutoff,
)
.order_by(Post.created_utc.desc())
.limit(50)
)
result = db.execute(stmt)
recent_posts = [{"id": r[0], "reddit_id": r[1], "subreddit_id": r[2]} for r in result]
if not recent_posts:
subreddits = _get_active_subreddits()
if not subreddits:
return
cutoff_epoch = int((datetime.now(timezone.utc) - timedelta(hours=48)).timestamp())
client = create_client()
async with client:
for post in recent_posts:
short_id = post["reddit_id"].replace("t3_", "")
data = await fetch_json(client, f"/comments/{short_id}", {"limit": "500", "sort": "new"})
if not data or len(data) < 2:
for sub in subreddits:
data = await fetch_json(client, "/reddit/search/comment/", {
"subreddit": sub["name"],
"sort": "created_utc",
"sort_type": "desc",
"size": 100,
"after": cutoff_epoch,
})
if not data:
continue
comments_data = data.get("data", [])
if not comments_data:
continue
comment_listing = data[1].get("data", {}).get("children", [])
with SyncSession() as db:
# Build parent_map from existing comments
existing = db.execute(
select(Comment.id, Comment.reddit_id).where(Comment.post_id == post["id"])
# Build lookup: reddit post fullname -> our DB post ID
link_ids = {c.get("link_id") for c in comments_data if c.get("link_id")}
if not link_ids:
continue
result = db.execute(
select(Post.id, Post.reddit_id).where(Post.reddit_id.in_(link_ids))
)
parent_map = {r[1]: r[0] for r in existing}
post_lookup = {reddit_id: post_id for post_id, reddit_id in result}
# Build parent_map: comment reddit_id -> our DB comment ID
existing = db.execute(
select(Comment.id, Comment.reddit_id)
.join(Post)
.where(Post.subreddit_id == sub["id"])
)
parent_map = {reddit_id: cid for cid, reddit_id in existing}
comments_to_upsert = []
def process_comments(children):
for child in children:
if child.get("kind") == "more":
continue
c_data = child.get("data", {})
parsed = _parse_comment(c_data, post["id"], db, parent_map)
if parsed:
comments_to_upsert.append(parsed)
# Process replies recursively
replies = c_data.get("replies")
if isinstance(replies, dict):
reply_children = replies.get("data", {}).get("children", [])
process_comments(reply_children)
process_comments(comment_listing)
for c in comments_data:
post_id = post_lookup.get(c.get("link_id"))
if not post_id:
continue # Post not in our DB yet
parsed = _parse_comment(c, post_id, db, parent_map)
if parsed:
comments_to_upsert.append(parsed)
if comments_to_upsert:
# Upsert comments one at a time to handle parent references
for comment in comments_to_upsert:
stmt = insert(Comment).values(comment)
stmt = stmt.on_conflict_do_update(
@@ -244,52 +250,40 @@ async def _collect_comments_async():
)
db.execute(stmt)
db.commit()
logger.info(f"Post {short_id}: upserted {len(comments_to_upsert)} comments")
logger.info(f"r/{sub['name']}: upserted {len(comments_to_upsert)} comments")
def update_scores():
"""Re-fetch recent posts to update scores and comment counts."""
"""Re-fetch recent posts to capture any score updates in Pullpush."""
asyncio.run(_update_scores_async())
async def _update_scores_async():
cutoff = datetime.now(timezone.utc) - timedelta(days=7)
with SyncSession() as db:
stmt = (
select(Post.reddit_id, Post.subreddit_id, MonitoredSubreddit.name)
.join(MonitoredSubreddit)
.where(
MonitoredSubreddit.is_active == True, # noqa: E712
Post.created_utc >= cutoff,
)
)
result = db.execute(stmt)
posts_by_sub: dict[str, list[str]] = {}
for reddit_id, _, sub_name in result:
posts_by_sub.setdefault(sub_name, []).append(reddit_id)
if not posts_by_sub:
subreddits = _get_active_subreddits()
if not subreddits:
return
# Score updates piggyback on the new/hot polls — the upsert already updates scores.
# This job explicitly re-fetches to catch score changes on older posts.
after_epoch = int((datetime.now(timezone.utc) - timedelta(days=7)).timestamp())
client = create_client()
async with client:
for sub_name, reddit_ids in posts_by_sub.items():
data = await fetch_json(client, f"/r/{sub_name}/new", {"limit": "100"})
for sub in subreddits:
data = await fetch_json(client, "/reddit/search/submission/", {
"subreddit": sub["name"],
"sort": "created_utc",
"sort_type": "desc",
"size": 100,
"after": after_epoch,
})
if not data:
continue
children = data.get("data", {}).get("children", [])
posts_data = data.get("data", [])
if not posts_data:
continue
with SyncSession() as db:
sub = db.execute(
select(MonitoredSubreddit).where(MonitoredSubreddit.name == sub_name)
).scalar_one_or_none()
if not sub:
continue
posts = [_parse_post(child, sub.id, db) for child in children]
posts = [_parse_post(p, sub["id"], db) for p in posts_data]
_upsert_posts(db, posts)
db.commit()
logger.info(f"Score update complete for {len(posts_by_sub)} subreddits")
logger.info(f"Score update complete for {len(subreddits)} subreddits")