Add Reddit monitoring bot — backend, frontend, and Docker config

Python/FastAPI backend with PostgreSQL for collecting Reddit data via
public .json endpoints. React/Vite dashboard for analytics. Docker Compose
setup with API and worker services connecting to shared PostgreSQL.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-09 19:29:58 -05:00
parent aaa240dbf0
commit bc2203524f
76 changed files with 7570 additions and 0 deletions

0
backend/__init__.py Normal file
View File

18
backend/config.py Normal file
View File

@@ -0,0 +1,18 @@
from pydantic_settings import BaseSettings
class Settings(BaseSettings):
    """Application configuration backed by pydantic-settings.

    Each field falls back to the default declared here; ``model_config``
    additionally points at a UTF-8 encoded ``.env`` file.
    """

    # SQLAlchemy URL using the asyncpg driver.
    database_url: str = "postgresql+asyncpg://reddit:changeme@localhost:5432/reddit_monitor"
    reddit_user_agent: str = "reddit-monitor:v1.0"
    # NOTE(review): a single string, presumably a delimited list of subreddit
    # names -- confirm the format against whatever consumes it.
    seed_subreddits: str = ""
    digest_hour_utc: int = 23
    ai_summary_enabled: bool = False

    @property
    def database_url_sync(self) -> str:
        """Return ``database_url`` with ``+asyncpg`` replaced by ``+psycopg2``."""
        return self.database_url.replace("+asyncpg", "+psycopg2")

    model_config = {"env_file": ".env", "env_file_encoding": "utf-8"}


# Shared, module-level settings instance.
settings = Settings()

18
backend/database.py Normal file
View File

@@ -0,0 +1,18 @@
from sqlalchemy.ext.asyncio import async_sessionmaker, create_async_engine, AsyncSession
from collections.abc import AsyncGenerator
from backend.config import settings
# Async engine built from the configured database URL.
engine = create_async_engine(
    settings.database_url,
    pool_size=5,
    max_overflow=10,
    # Recycle pooled connections after one hour (value is in seconds).
    pool_recycle=3600,
)
# Session factory; expire_on_commit=False keeps loaded attributes readable
# after a commit without triggering a refresh.
async_session = async_sessionmaker(engine, expire_on_commit=False)
async def get_db() -> AsyncGenerator[AsyncSession, None]:
    """Yield one ``AsyncSession`` from the session factory.

    The session is opened with ``async with``, so it is closed when the
    generator is finalised.
    """
    async with async_session() as db_session:
        yield db_session

39
backend/main.py Normal file
View File

@@ -0,0 +1,39 @@
from contextlib import asynccontextmanager
from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from backend.database import engine
from backend.routers import health, subreddits, posts, comments, authors, analytics, digests, summaries
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: dispose the database engine on shutdown.

    Nothing is set up before the ``yield``. Disposal sits in ``finally`` so
    the connection pool is released even when an exception is thrown into
    the context.
    """
    try:
        yield
    finally:
        await engine.dispose()
app = FastAPI(title="Reddit Monitor", lifespan=lifespan)

# API routes: every router module is mounted under the same versioned prefix,
# in the order listed here.
for _router_module in (
    health,
    subreddits,
    posts,
    comments,
    authors,
    analytics,
    digests,
    summaries,
):
    app.include_router(_router_module.router, prefix="/api/v1")
# SPA static file serving (only when frontend is built)
import os

from fastapi import HTTPException

static_dir = os.path.join(os.path.dirname(__file__), "..", "static")
if os.path.isdir(static_dir):
    assets_dir = os.path.join(static_dir, "assets")
    # StaticFiles validates the directory when it is constructed, so a build
    # without an assets/ folder would otherwise abort application start-up.
    if os.path.isdir(assets_dir):
        app.mount("/assets", StaticFiles(directory=assets_dir), name="assets")

    @app.get("/{full_path:path}")
    async def serve_spa(full_path: str):
        """Serve the SPA entry point for any path no earlier route matched.

        Unmatched ``api/...`` paths get a 404 instead of the HTML shell, so
        API clients never receive a 200 HTML page for a missing endpoint.
        """
        if full_path == "api" or full_path.startswith("api/"):
            raise HTTPException(status_code=404, detail="Not Found")
        return FileResponse(os.path.join(static_dir, "index.html"))

View File

@@ -0,0 +1,19 @@
from backend.models.base import Base
from backend.models.subreddit import MonitoredSubreddit
from backend.models.author import Author
from backend.models.post import Post
from backend.models.comment import Comment
from backend.models.metric_snapshot import MetricSnapshot
from backend.models.daily_digest import DailyDigest
from backend.models.summary import Summary
# Public names of the models package: the declarative base plus every model
# class imported above.
__all__ = [
    "Base",
    "MonitoredSubreddit",
    "Author",
    "Post",
    "Comment",
    "MetricSnapshot",
    "DailyDigest",
    "Summary",
]

23
backend/models/author.py Normal file
View File

@@ -0,0 +1,23 @@
from datetime import datetime, timezone
from sqlalchemy import String, Integer, DateTime
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class Author(Base):
    """ORM model for a Reddit author, stored in the ``authors`` table."""

    __tablename__ = "authors"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Unique per author, at most 255 characters.
    username: Mapped[str] = mapped_column(String(255), unique=True, nullable=False)
    # Both timestamps default to the current UTC time on insert. There is no
    # onupdate hook, so last_seen_at has to be set explicitly by the writer.
    first_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )
    last_seen_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )
    # Stored counters, defaulting to 0.
    total_posts: Mapped[int] = mapped_column(Integer, default=0)
    total_comments: Mapped[int] = mapped_column(Integer, default=0)

    # Reverse sides of Post.author and Comment.author.
    posts: Mapped[list["Post"]] = relationship(back_populates="author")  # noqa: F821
    comments: Mapped[list["Comment"]] = relationship(back_populates="author")  # noqa: F821

5
backend/models/base.py Normal file
View File

@@ -0,0 +1,5 @@
from sqlalchemy.orm import DeclarativeBase
class Base(DeclarativeBase):
    """Declarative base class that every ORM model in the package derives from."""

    pass

34
backend/models/comment.py Normal file
View File

@@ -0,0 +1,34 @@
from datetime import datetime, timezone
from sqlalchemy import String, Integer, DateTime, ForeignKey
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class Comment(Base):
    """ORM model for a collected comment, stored in the ``comments`` table."""

    __tablename__ = "comments"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Unique identifier string, at most 20 characters.
    reddit_id: Mapped[str] = mapped_column(String(20), unique=True, nullable=False)
    post_id: Mapped[int] = mapped_column(ForeignKey("posts.id"), nullable=False, index=True)
    # Self-referencing foreign key; nullable per its annotation.
    parent_comment_id: Mapped[int | None] = mapped_column(
        ForeignKey("comments.id"), index=True
    )
    # Nullable: a comment may have no linked author row.
    author_id: Mapped[int | None] = mapped_column(ForeignKey("authors.id"), index=True)
    body: Mapped[str] = mapped_column(nullable=False)
    score: Mapped[int] = mapped_column(Integer, default=0)
    created_utc: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False)
    # Defaults to the current UTC time on insert.
    collected_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )
    # Set on insert and refreshed on every ORM update.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
    )

    post: Mapped["Post"] = relationship(back_populates="comments")  # noqa: F821
    author: Mapped["Author | None"] = relationship(back_populates="comments")  # noqa: F821
    # remote_side marks Comment.id as the parent end of the self-reference.
    parent_comment: Mapped["Comment | None"] = relationship(
        remote_side="Comment.id", foreign_keys=[parent_comment_id]
    )

View File

@@ -0,0 +1,22 @@
from datetime import date, datetime, timezone
from sqlalchemy import Date, DateTime, ForeignKey, JSON
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class DailyDigest(Base):
    """ORM model for a per-subreddit digest, stored in ``daily_digests``."""

    __tablename__ = "daily_digests"

    id: Mapped[int] = mapped_column(primary_key=True)
    subreddit_id: Mapped[int] = mapped_column(
        ForeignKey("monitored_subreddits.id"), nullable=False
    )
    digest_date: Mapped[date] = mapped_column(Date, nullable=False)
    content: Mapped[str] = mapped_column(nullable=False)
    # The column is named "metadata" but the attribute is "metadata_", because
    # "metadata" is reserved on declarative classes. Code that reads values by
    # column name must use the attribute key instead.
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSON)
    # Defaults to the current UTC time on insert.
    generated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )

    subreddit: Mapped["MonitoredSubreddit"] = relationship(back_populates="daily_digests")  # noqa: F821

View File

@@ -0,0 +1,23 @@
from datetime import datetime, timezone
from sqlalchemy import Integer, Float, DateTime, ForeignKey, Index
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class MetricSnapshot(Base):
    """ORM model for a point-in-time reading of a post's metrics (``metric_snapshots``)."""

    __tablename__ = "metric_snapshots"
    # Composite index on (post_id, snapshot_at).
    __table_args__ = (
        Index("ix_metric_snapshots_post_snapshot", "post_id", "snapshot_at"),
    )

    id: Mapped[int] = mapped_column(primary_key=True)
    post_id: Mapped[int] = mapped_column(ForeignKey("posts.id"), nullable=False)
    score: Mapped[int] = mapped_column(Integer, nullable=False)
    num_comments: Mapped[int] = mapped_column(Integer, nullable=False)
    upvote_ratio: Mapped[float | None] = mapped_column(Float)
    # Defaults to the current UTC time on insert.
    snapshot_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )

    post: Mapped["Post"] = relationship(back_populates="metric_snapshots")  # noqa: F821

42
backend/models/post.py Normal file
View File

@@ -0,0 +1,42 @@
from datetime import datetime, timezone
from sqlalchemy import String, Boolean, Integer, Float, DateTime, ForeignKey, Index
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class Post(Base):
    """ORM model for a collected Reddit post, stored in the ``posts`` table."""

    __tablename__ = "posts"
    # Composite index on (subreddit_id, created_utc).
    __table_args__ = (
        Index("ix_posts_subreddit_created", "subreddit_id", "created_utc"),
    )

    id: Mapped[int] = mapped_column(primary_key=True)
    # Unique identifier string, at most 20 characters.
    reddit_id: Mapped[str] = mapped_column(String(20), unique=True, nullable=False)
    subreddit_id: Mapped[int] = mapped_column(ForeignKey("monitored_subreddits.id"), index=True)
    # Nullable: a post may have no linked author row.
    author_id: Mapped[int | None] = mapped_column(ForeignKey("authors.id"), index=True)
    title: Mapped[str] = mapped_column(nullable=False)
    selftext: Mapped[str | None] = mapped_column()
    url: Mapped[str | None] = mapped_column()
    permalink: Mapped[str | None] = mapped_column()
    flair: Mapped[str | None] = mapped_column(String(255))
    score: Mapped[int] = mapped_column(Integer, default=0, index=True)
    upvote_ratio: Mapped[float | None] = mapped_column(Float)
    num_comments: Mapped[int] = mapped_column(Integer, default=0)
    is_self: Mapped[bool | None] = mapped_column(Boolean)
    over_18: Mapped[bool] = mapped_column(Boolean, default=False)
    hot_rank: Mapped[int | None] = mapped_column(Integer)
    created_utc: Mapped[datetime] = mapped_column(DateTime(timezone=True), nullable=False)
    # Defaults to the current UTC time on insert.
    collected_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )
    # Set on insert and refreshed on every ORM update.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
    )

    subreddit: Mapped["MonitoredSubreddit"] = relationship(back_populates="posts")  # noqa: F821
    author: Mapped["Author | None"] = relationship(back_populates="posts")  # noqa: F821
    comments: Mapped[list["Comment"]] = relationship(back_populates="post")  # noqa: F821
    metric_snapshots: Mapped[list["MetricSnapshot"]] = relationship(back_populates="post")  # noqa: F821

View File

@@ -0,0 +1,28 @@
from datetime import datetime, timezone
from sqlalchemy import String, Boolean, Integer, DateTime
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class MonitoredSubreddit(Base):
    """ORM model for a tracked subreddit, stored in ``monitored_subreddits``."""

    __tablename__ = "monitored_subreddits"

    id: Mapped[int] = mapped_column(primary_key=True)
    # Unique per subreddit, at most 255 characters.
    name: Mapped[str] = mapped_column(String(255), unique=True, nullable=False)
    display_name: Mapped[str | None] = mapped_column(String(255))
    description: Mapped[str | None] = mapped_column()
    subscribers: Mapped[int | None] = mapped_column(Integer)
    # New rows are active unless stated otherwise.
    is_active: Mapped[bool] = mapped_column(Boolean, default=True)
    # Defaults to the current UTC time on insert.
    created_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )
    # Set on insert and refreshed on every ORM update.
    updated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True),
        default=lambda: datetime.now(timezone.utc),
        onupdate=lambda: datetime.now(timezone.utc),
    )

    # Reverse sides of the "subreddit" relationship on Post, DailyDigest and Summary.
    posts: Mapped[list["Post"]] = relationship(back_populates="subreddit")  # noqa: F821
    daily_digests: Mapped[list["DailyDigest"]] = relationship(back_populates="subreddit")  # noqa: F821
    summaries: Mapped[list["Summary"]] = relationship(back_populates="subreddit")  # noqa: F821

25
backend/models/summary.py Normal file
View File

@@ -0,0 +1,25 @@
from datetime import datetime, timezone
from sqlalchemy import String, DateTime, ForeignKey, JSON
from sqlalchemy.orm import Mapped, mapped_column, relationship
from backend.models.base import Base
class Summary(Base):
    """ORM model for a generated summary of a subreddit, stored in ``summaries``."""

    __tablename__ = "summaries"

    id: Mapped[int] = mapped_column(primary_key=True)
    subreddit_id: Mapped[int] = mapped_column(
        ForeignKey("monitored_subreddits.id"), nullable=False
    )
    summary_type: Mapped[str] = mapped_column(String(50), nullable=False)
    content: Mapped[str | None] = mapped_column()
    # The column is named "metadata" but the attribute is "metadata_", because
    # "metadata" is reserved on declarative classes.
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSON)
    period_start: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    period_end: Mapped[datetime | None] = mapped_column(DateTime(timezone=True))
    provider: Mapped[str | None] = mapped_column(String(100))
    # Defaults to the current UTC time on insert.
    generated_at: Mapped[datetime] = mapped_column(
        DateTime(timezone=True), default=lambda: datetime.now(timezone.utc)
    )

    subreddit: Mapped["MonitoredSubreddit"] = relationship(back_populates="summaries")  # noqa: F821

View File

View File

@@ -0,0 +1,61 @@
from datetime import datetime
from fastapi import APIRouter, Depends, Query
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
from backend.services import analytics_service
router = APIRouter(prefix="/analytics", tags=["analytics"])
@router.get("/engagement")
async def engagement(
    subreddit_id: int | None = None,
    granularity: str = Query("day", pattern="^(hour|day|week)$"),
    since: datetime | None = None,
    until: datetime | None = None,
    db: AsyncSession = Depends(get_db),
):
    """Return engagement figures bucketed by hour, day or week.

    Validates ``granularity`` and delegates to
    ``analytics_service.get_engagement``.
    """
    series = await analytics_service.get_engagement(
        db,
        subreddit_id=subreddit_id,
        granularity=granularity,
        since=since,
        until=until,
    )
    return series
@router.get("/top-posts")
async def top_posts(
    subreddit_id: int | None = None,
    metric: str = Query("score", pattern="^(score|num_comments)$"),
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = Query(10, ge=1, le=50),
    db: AsyncSession = Depends(get_db),
):
    """Return up to ``limit`` posts ranked by ``metric``.

    Delegates to ``analytics_service.get_top_posts``.
    """
    ranked = await analytics_service.get_top_posts(
        db,
        subreddit_id=subreddit_id,
        metric=metric,
        since=since,
        until=until,
        limit=limit,
    )
    return ranked
@router.get("/top-authors")
async def top_authors(
    subreddit_id: int | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = Query(10, ge=1, le=50),
    db: AsyncSession = Depends(get_db),
):
    """Return up to ``limit`` authors ranked by activity.

    Delegates to ``analytics_service.get_top_authors``.
    """
    ranked = await analytics_service.get_top_authors(
        db,
        subreddit_id=subreddit_id,
        since=since,
        until=until,
        limit=limit,
    )
    return ranked
@router.get("/subreddit-summary")
async def subreddit_summary(
    since: datetime | None = None,
    until: datetime | None = None,
    db: AsyncSession = Depends(get_db),
):
    """Return per-subreddit totals for the requested time window.

    Delegates to ``analytics_service.get_subreddit_summary``.
    """
    overview = await analytics_service.get_subreddit_summary(
        db, since=since, until=until
    )
    return overview
@router.get("/flair-distribution")
async def flair_distribution(
    subreddit_id: int = Query(...),
    since: datetime | None = None,
    until: datetime | None = None,
    db: AsyncSession = Depends(get_db),
):
    """Return post counts per flair for one subreddit (``subreddit_id`` is required).

    Delegates to ``analytics_service.get_flair_distribution``.
    """
    distribution = await analytics_service.get_flair_distribution(
        db,
        subreddit_id=subreddit_id,
        since=since,
        until=until,
    )
    return distribution

View File

@@ -0,0 +1,39 @@
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
from backend.services import author_service
router = APIRouter(prefix="/authors", tags=["authors"])
@router.get("")
async def list_authors(
    subreddit_id: int | None = None,
    sort_by: str = Query("total_comments", pattern="^(total_posts|total_comments)$"),
    sort_order: str = Query("desc", pattern="^(asc|desc)$"),
    since: datetime | None = None,
    until: datetime | None = None,
    page: int = Query(1, ge=1),
    per_page: int = Query(25, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
):
    """Return one page of authors wrapped in a pagination envelope.

    The envelope holds ``data``, ``total``, ``page``, ``per_page`` and the
    derived page count ``pages``.
    """
    authors, total = await author_service.list_authors(
        db,
        subreddit_id=subreddit_id,
        sort_by=sort_by,
        sort_order=sort_order,
        since=since,
        until=until,
        page=page,
        per_page=per_page,
    )

    # Ceiling division; a falsy per_page yields zero pages.
    if per_page:
        page_count = (total + per_page - 1) // per_page
    else:
        page_count = 0

    return {
        "data": authors,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": page_count,
    }
@router.get("/{author_id}")
async def get_author(author_id: int, db: AsyncSession = Depends(get_db)):
    """Return a single author, or respond 404 when the service finds none."""
    found = await author_service.get_author(db, author_id)
    if found:
        return found
    raise HTTPException(status_code=404, detail="Author not found")

View File

@@ -0,0 +1,33 @@
from datetime import datetime
from fastapi import APIRouter, Depends, Query
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
from backend.services import comment_service
router = APIRouter(prefix="/comments", tags=["comments"])
@router.get("")
async def list_comments(
    post_id: int | None = None,
    subreddit_id: int | None = None,
    author: str | None = None,
    sort_by: str = Query("created_utc", pattern="^(created_utc|score)$"),
    sort_order: str = Query("desc", pattern="^(asc|desc)$"),
    since: datetime | None = None,
    until: datetime | None = None,
    page: int = Query(1, ge=1),
    per_page: int = Query(25, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
):
    """Return one page of comments wrapped in a pagination envelope.

    Filtering, sorting and paging are delegated to
    ``comment_service.list_comments``.
    """
    service_args = (
        db, post_id, subreddit_id, author, sort_by, sort_order, since, until, page, per_page
    )
    comments, total = await comment_service.list_comments(*service_args)

    # Ceiling division; a falsy per_page yields zero pages.
    if per_page:
        page_count = (total + per_page - 1) // per_page
    else:
        page_count = 0

    return {
        "data": comments,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": page_count,
    }

View File

@@ -0,0 +1,54 @@
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
from backend.models.daily_digest import DailyDigest
from backend.models.subreddit import MonitoredSubreddit
router = APIRouter(prefix="/digests", tags=["digests"])
def _digest_to_dict(digest: DailyDigest, subreddit_name: str) -> dict:
    """Serialise a digest row to a plain dict keyed by column name.

    Values are read through the mapped attribute key, not the column name.
    The ``metadata`` column is mapped as ``metadata_``; reading
    ``getattr(digest, "metadata")`` would return the declarative ``MetaData``
    registry instead of the stored JSON.
    """
    data = {
        attr.columns[0].name: getattr(digest, attr.key)
        for attr in DailyDigest.__mapper__.column_attrs
    }
    data["subreddit_name"] = subreddit_name
    return data


@router.get("")
async def list_digests(
    subreddit_id: int | None = None,
    page: int = Query(1, ge=1),
    per_page: int = Query(25, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
):
    """Return one page of digests, newest first, optionally for one subreddit.

    The response holds ``data`` (digest dicts with ``subreddit_name`` added),
    ``page`` and ``per_page``.
    """
    stmt = (
        select(DailyDigest, MonitoredSubreddit.name)
        .join(MonitoredSubreddit)
        # Several subreddits share a digest_date; the id tie-breaker keeps
        # OFFSET/LIMIT pagination stable across requests.
        .order_by(DailyDigest.digest_date.desc(), DailyDigest.id.desc())
    )
    if subreddit_id is not None:
        stmt = stmt.where(DailyDigest.subreddit_id == subreddit_id)
    stmt = stmt.offset((page - 1) * per_page).limit(per_page)

    result = await db.execute(stmt)
    digests = [_digest_to_dict(digest, sub_name) for digest, sub_name in result.all()]
    return {"data": digests, "page": page, "per_page": per_page}


@router.get("/{digest_id}")
async def get_digest(digest_id: int, db: AsyncSession = Depends(get_db)):
    """Return a single digest with its subreddit name, or respond 404."""
    stmt = (
        select(DailyDigest, MonitoredSubreddit.name)
        .join(MonitoredSubreddit)
        .where(DailyDigest.id == digest_id)
    )
    result = await db.execute(stmt)
    row = result.first()
    if not row:
        raise HTTPException(status_code=404, detail="Digest not found")
    digest, sub_name = row
    return _digest_to_dict(digest, sub_name)

16
backend/routers/health.py Normal file
View File

@@ -0,0 +1,16 @@
from fastapi import APIRouter, Depends
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
router = APIRouter()
@router.get("/health")
async def health_check(db: AsyncSession = Depends(get_db)):
    """Report liveness and database connectivity.

    Runs ``SELECT 1``. Any failure is reported as a degraded status rather
    than raised, so the endpoint itself always answers; the exception is
    logged so the cause is not lost.
    """
    import logging  # local import keeps this block self-contained

    try:
        await db.execute(text("SELECT 1"))
    except Exception:
        logging.getLogger(__name__).exception("Health check database probe failed")
        return {"status": "degraded", "db": "disconnected"}
    return {"status": "ok", "db": "connected"}

64
backend/routers/posts.py Normal file
View File

@@ -0,0 +1,64 @@
from datetime import datetime
from fastapi import APIRouter, Depends, HTTPException, Query
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
from backend.schemas.post import PostResponse, PostDetailResponse
from backend.services import post_service
from backend.models.metric_snapshot import MetricSnapshot
from sqlalchemy import select
router = APIRouter(prefix="/posts", tags=["posts"])
@router.get("")
async def list_posts(
    subreddit_id: int | None = None,
    author: str | None = None,
    flair: str | None = None,
    sort_by: str = Query("created_utc", pattern="^(created_utc|score|num_comments)$"),
    sort_order: str = Query("desc", pattern="^(asc|desc)$"),
    since: datetime | None = None,
    until: datetime | None = None,
    page: int = Query(1, ge=1),
    per_page: int = Query(25, ge=1, le=100),
    db: AsyncSession = Depends(get_db),
):
    """Return one page of posts wrapped in a pagination envelope.

    Filtering, sorting and paging are delegated to
    ``post_service.list_posts``.
    """
    service_args = (
        db, subreddit_id, author, flair, sort_by, sort_order, since, until, page, per_page
    )
    posts, total = await post_service.list_posts(*service_args)

    # Ceiling division; a falsy per_page yields zero pages.
    if per_page:
        page_count = (total + per_page - 1) // per_page
    else:
        page_count = 0

    return {
        "data": posts,
        "total": total,
        "page": page,
        "per_page": per_page,
        "pages": page_count,
    }
@router.get("/{post_id}")
async def get_post(post_id: int, db: AsyncSession = Depends(get_db)):
    """Return a single post, or respond 404 when the service finds none."""
    found = await post_service.get_post(db, post_id)
    if found:
        return found
    raise HTTPException(status_code=404, detail="Post not found")
@router.get("/{post_id}/snapshots")
async def get_post_snapshots(post_id: int, db: AsyncSession = Depends(get_db)):
    """Return the metric snapshots recorded for a post, oldest first.

    An unknown ``post_id`` produces an empty list; no 404 is raised.
    """
    query = select(MetricSnapshot).where(MetricSnapshot.post_id == post_id)
    query = query.order_by(MetricSnapshot.snapshot_at.asc())
    rows = (await db.execute(query)).scalars().all()

    history = []
    for snapshot in rows:
        history.append(
            {
                "score": snapshot.score,
                "num_comments": snapshot.num_comments,
                "upvote_ratio": snapshot.upvote_ratio,
                "snapshot_at": snapshot.snapshot_at,
            }
        )
    return history

View File

@@ -0,0 +1,47 @@
from fastapi import APIRouter, Depends, HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from backend.database import get_db
from backend.schemas.subreddit import SubredditCreate, SubredditUpdate, SubredditResponse
from backend.services import subreddit_service
router = APIRouter(prefix="/subreddits", tags=["subreddits"])
@router.get("", response_model=list[SubredditResponse])
async def list_subreddits(db: AsyncSession = Depends(get_db)):
    """Return every subreddit provided by ``subreddit_service.list_subreddits``."""
    subreddits = await subreddit_service.list_subreddits(db)
    return subreddits
@router.post("", response_model=SubredditResponse, status_code=201)
async def create_subreddit(body: SubredditCreate, db: AsyncSession = Depends(get_db)):
    """Create a subreddit from ``body.name`` and return it with ``post_count`` of 0."""
    created = await subreddit_service.create_subreddit(db, body.name)

    payload = {}
    for column in created.__table__.columns:
        payload[column.name] = getattr(created, column.name)
    # Reported as zero without querying for posts.
    payload["post_count"] = 0
    return payload
@router.get("/{subreddit_id}", response_model=SubredditResponse)
async def get_subreddit(subreddit_id: int, db: AsyncSession = Depends(get_db)):
    """Return a single subreddit, or respond 404 when the service finds none."""
    found = await subreddit_service.get_subreddit(db, subreddit_id)
    if found:
        return found
    raise HTTPException(status_code=404, detail="Subreddit not found")
@router.patch("/{subreddit_id}", response_model=SubredditResponse)
async def update_subreddit(
    subreddit_id: int, body: SubredditUpdate, db: AsyncSession = Depends(get_db)
):
    """Apply ``body.is_active`` to a subreddit and return its refreshed view.

    Responds 404 when the service reports nothing to update.
    """
    updated = await subreddit_service.update_subreddit(db, subreddit_id, body.is_active)
    if not updated:
        raise HTTPException(status_code=404, detail="Subreddit not found")
    # Re-read through the service's get_subreddit for the response.
    return await subreddit_service.get_subreddit(db, subreddit_id)
@router.delete("/{subreddit_id}", status_code=204)
async def delete_subreddit(subreddit_id: int, db: AsyncSession = Depends(get_db)):
    """Delete a subreddit; respond 404 when the service reports nothing deleted."""
    if not await subreddit_service.delete_subreddit(db, subreddit_id):
        raise HTTPException(status_code=404, detail="Subreddit not found")

View File

@@ -0,0 +1,13 @@
from fastapi import APIRouter
router = APIRouter(prefix="/summaries", tags=["summaries"])
@router.get("")
async def list_summaries():
    """Placeholder endpoint: always returns an empty list plus a notice."""
    placeholder = {"data": [], "message": "AI summaries not yet configured"}
    return placeholder
@router.get("/{summary_id}")
async def get_summary(summary_id: int):
    """Placeholder endpoint: returns the same notice for every ``summary_id``."""
    placeholder = {"detail": "AI summaries not yet configured"}
    return placeholder

View File

View File

@@ -0,0 +1,54 @@
from datetime import datetime, date
from pydantic import BaseModel
class EngagementPoint(BaseModel):
    """One time bucket of engagement data."""

    # Bucket label, carried as a string.
    period: str
    posts: int
    comments: int
    avg_score: float
class TopPost(BaseModel):
    """A post entry in a top-posts ranking."""

    id: int
    title: str
    score: int
    num_comments: int
    # Required field that may be None.
    author_name: str | None
    subreddit_name: str
    created_utc: datetime
    permalink: str | None
class TopAuthor(BaseModel):
    """An author entry in a top-authors ranking."""

    id: int
    username: str
    post_count: int
    comment_count: int
    # NOTE(review): presumably post_count + comment_count -- the schema does
    # not enforce it.
    total_activity: int
class SubredditSummary(BaseModel):
    """Aggregate figures for one subreddit."""

    subreddit_id: int
    subreddit_name: str
    total_posts: int
    total_comments: int
    avg_score: float
    # Required field that may be None.
    top_flair: str | None
class FlairCount(BaseModel):
    """Number of posts carrying a given flair (the flair may be None)."""

    flair: str | None
    count: int
class DigestResponse(BaseModel):
    """Response schema for a daily digest; can be populated from attribute access."""

    id: int
    subreddit_id: int
    subreddit_name: str | None = None
    digest_date: date
    content: str
    # Trailing underscore matches the attribute name on the DailyDigest model.
    metadata_: dict | None = None
    generated_at: datetime

    # from_attributes allows validation from arbitrary objects, not only dicts.
    model_config = {"from_attributes": True}

13
backend/schemas/author.py Normal file
View File

@@ -0,0 +1,13 @@
from datetime import datetime
from pydantic import BaseModel
class AuthorResponse(BaseModel):
    """Response schema for an author; can be populated from attribute access."""

    id: int
    username: str
    first_seen_at: datetime
    last_seen_at: datetime
    total_posts: int
    total_comments: int

    # from_attributes allows validation from arbitrary objects, not only dicts.
    model_config = {"from_attributes": True}

19
backend/schemas/common.py Normal file
View File

@@ -0,0 +1,19 @@
from datetime import datetime
from pydantic import BaseModel
class PaginationParams(BaseModel):
    """Page selection parameters: page 1 with 25 items per page by default."""

    page: int = 1
    per_page: int = 25
class PaginatedResponse(BaseModel):
    """Pagination fields of a list response (no item payload is declared here)."""

    total: int
    page: int
    per_page: int
    pages: int
class TimeRangeParams(BaseModel):
    """Optional time window; either bound may be omitted."""

    since: datetime | None = None
    until: datetime | None = None

45
backend/schemas/post.py Normal file
View File

@@ -0,0 +1,45 @@
from datetime import datetime
from pydantic import BaseModel
class PostResponse(BaseModel):
    """Response schema for a post; can be populated from attribute access."""

    id: int
    reddit_id: str
    subreddit_id: int
    # Optional extra, defaulting to None when the source does not supply it.
    subreddit_name: str | None = None
    author_id: int | None
    # Optional extra, defaulting to None when the source does not supply it.
    author_name: str | None = None
    title: str
    selftext: str | None
    url: str | None
    permalink: str | None
    flair: str | None
    score: int
    upvote_ratio: float | None
    num_comments: int
    is_self: bool | None
    over_18: bool
    hot_rank: int | None
    created_utc: datetime
    collected_at: datetime
    updated_at: datetime

    # from_attributes allows validation from arbitrary objects, not only dicts.
    model_config = {"from_attributes": True}
class PostDetailResponse(PostResponse):
    """A post response extended with its comments."""

    # String forward reference: CommentResponse is declared later in this module.
    comments: list["CommentResponse"] = []
class CommentResponse(BaseModel):
    """Response schema for a comment; can be populated from attribute access."""

    id: int
    reddit_id: str
    post_id: int
    parent_comment_id: int | None
    author_id: int | None
    # Optional extra, defaulting to None when the source does not supply it.
    author_name: str | None = None
    body: str
    score: int
    created_utc: datetime

    # from_attributes allows validation from arbitrary objects, not only dicts.
    model_config = {"from_attributes": True}

View File

@@ -0,0 +1,24 @@
from datetime import datetime
from pydantic import BaseModel
class SubredditCreate(BaseModel):
    """Request body for creating a subreddit entry."""

    name: str
class SubredditUpdate(BaseModel):
    """Request body for updating a subreddit; ``is_active`` may be omitted (None)."""

    is_active: bool | None = None
class SubredditResponse(BaseModel):
    """Response schema for a subreddit; can be populated from attribute access."""

    id: int
    name: str
    display_name: str | None
    description: str | None
    subscribers: int | None
    is_active: bool
    created_at: datetime
    updated_at: datetime
    # Defaults to 0 when the source object carries no post_count.
    post_count: int = 0

    # from_attributes allows validation from arbitrary objects, not only dicts.
    model_config = {"from_attributes": True}

View File

View File

@@ -0,0 +1,231 @@
from datetime import datetime, timedelta, timezone
from sqlalchemy import select, func, case, text
from sqlalchemy.ext.asyncio import AsyncSession
from backend.models.post import Post
from backend.models.comment import Comment
from backend.models.author import Author
from backend.models.subreddit import MonitoredSubreddit
async def get_engagement(
    db: AsyncSession,
    subreddit_id: int | None = None,
    granularity: str = "day",
    since: datetime | None = None,
    until: datetime | None = None,
) -> list[dict]:
    """Return post and comment activity bucketed with ``date_trunc``.

    The window defaults to the last 30 days ending now (UTC). One dict is
    produced per bucket that contains posts, with keys ``period``, ``posts``,
    ``comments`` and ``avg_score`` (rounded to one decimal). Buckets that
    only contain comments are not emitted.
    """
    since = since or datetime.now(timezone.utc) - timedelta(days=30)
    until = until or datetime.now(timezone.utc)

    post_bucket = func.date_trunc(granularity, Post.created_utc)
    post_query = select(
        post_bucket.label("period"),
        func.count(Post.id).label("posts"),
        func.coalesce(func.avg(Post.score), 0).label("avg_score"),
    ).where(Post.created_utc >= since, Post.created_utc <= until)

    comment_bucket = func.date_trunc(granularity, Comment.created_utc)
    comment_query = (
        select(
            comment_bucket.label("period"),
            func.count(Comment.id).label("comments"),
        )
        .join(Post)
        .where(Comment.created_utc >= since, Comment.created_utc <= until)
    )

    if subreddit_id:
        post_query = post_query.where(Post.subreddit_id == subreddit_id)
        comment_query = comment_query.where(Post.subreddit_id == subreddit_id)

    post_rows = await db.execute(post_query.group_by("period").order_by("period"))
    comment_rows = await db.execute(comment_query.group_by("period"))

    # Comment counts keyed by the stringified bucket for lookup below.
    comments_by_period = {}
    for row in comment_rows:
        comments_by_period[str(row.period)] = row.comments

    series = []
    for row in post_rows:
        label = str(row.period)
        series.append(
            {
                "period": label,
                "posts": row.posts,
                "comments": comments_by_period.get(label, 0),
                "avg_score": round(float(row.avg_score), 1),
            }
        )
    return series
async def get_top_posts(
    db: AsyncSession,
    subreddit_id: int | None = None,
    metric: str = "score",
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = 10,
) -> list[dict]:
    """Return the highest-ranked posts by score or comment count.

    ``since`` defaults to seven days before now (UTC). ``until`` and
    ``subreddit_id`` are applied only when truthy. Any ``metric`` other than
    ``"score"`` ranks by ``num_comments``.
    """
    window_start = since or datetime.now(timezone.utc) - timedelta(days=7)

    criteria = [Post.created_utc >= window_start]
    if until:
        criteria.append(Post.created_utc <= until)
    if subreddit_id:
        criteria.append(Post.subreddit_id == subreddit_id)

    ranking = Post.num_comments if metric != "score" else Post.score
    query = (
        select(Post, MonitoredSubreddit.name, Author.username)
        .join(MonitoredSubreddit)
        # Outer join keeps posts that have no linked author.
        .outerjoin(Author)
        .where(*criteria)
        .order_by(ranking.desc())
        .limit(limit)
    )
    rows = (await db.execute(query)).all()

    top = []
    for post, subreddit_name, username in rows:
        top.append(
            {
                "id": post.id,
                "title": post.title,
                "score": post.score,
                "num_comments": post.num_comments,
                "author_name": username,
                "subreddit_name": subreddit_name,
                "created_utc": post.created_utc,
                "permalink": post.permalink,
            }
        )
    return top
async def get_top_authors(
    db: AsyncSession,
    subreddit_id: int | None = None,
    since: datetime | None = None,
    until: datetime | None = None,
    limit: int = 10,
) -> list[dict]:
    """Return authors ordered by posts plus comments inside the window.

    ``since`` defaults to seven days before now (UTC). Counts come from
    correlated scalar subqueries, so every author row is a candidate,
    including authors with no activity in the window.
    """
    window_start = since or datetime.now(timezone.utc) - timedelta(days=7)

    posts_q = select(func.count(Post.id)).where(
        Post.author_id == Author.id, Post.created_utc >= window_start
    )
    comments_q = select(func.count(Comment.id)).where(
        Comment.author_id == Author.id, Comment.created_utc >= window_start
    )
    if until:
        posts_q = posts_q.where(Post.created_utc <= until)
        comments_q = comments_q.where(Comment.created_utc <= until)
    if subreddit_id:
        posts_q = posts_q.where(Post.subreddit_id == subreddit_id)
        # Comments carry no subreddit column; filter through their post.
        comments_q = comments_q.join(Post).where(Post.subreddit_id == subreddit_id)

    post_total = posts_q.correlate(Author).scalar_subquery().label("post_count")
    comment_total = comments_q.correlate(Author).scalar_subquery().label("comment_count")

    query = (
        select(Author, post_total, comment_total)
        .order_by((post_total + comment_total).desc())
        .limit(limit)
    )
    rows = (await db.execute(query)).all()

    ranking = []
    for author, n_posts, n_comments in rows:
        n_posts = n_posts or 0
        n_comments = n_comments or 0
        ranking.append(
            {
                "id": author.id,
                "username": author.username,
                "post_count": n_posts,
                "comment_count": n_comments,
                "total_activity": n_posts + n_comments,
            }
        )
    return ranking
async def get_subreddit_summary(
    db: AsyncSession,
    since: datetime | None = None,
    until: datetime | None = None,
) -> list[dict]:
    """Return per-subreddit totals for every active subreddit.

    ``since`` defaults to seven days before now (UTC); ``until`` is optional.
    Each dict holds ``subreddit_id``, ``subreddit_name``, ``total_posts``,
    ``total_comments``, ``avg_score`` (one decimal) and ``top_flair``.
    Subreddits with no posts in the window are still listed, with zeros.
    """
    if not since:
        since = datetime.now(timezone.utc) - timedelta(days=7)

    # The whole time window lives in the JOIN condition. Filtering the
    # outer-joined Post table in WHERE would discard the NULL rows and
    # silently drop subreddits that have no posts in the window.
    post_window = (Post.subreddit_id == MonitoredSubreddit.id) & (Post.created_utc >= since)
    if until is not None:
        post_window = post_window & (Post.created_utc <= until)

    stmt = (
        select(
            MonitoredSubreddit.id,
            MonitoredSubreddit.name,
            func.count(Post.id).label("total_posts"),
            func.coalesce(func.avg(Post.score), 0).label("avg_score"),
        )
        .outerjoin(Post, post_window)
        .where(MonitoredSubreddit.is_active == True)  # noqa: E712
        .group_by(MonitoredSubreddit.id)
        .order_by(MonitoredSubreddit.name)
    )
    result = await db.execute(stmt)

    summaries = []
    for sub_id, sub_name, total_posts, avg_score in result.all():
        comment_stmt = (
            select(func.count(Comment.id))
            .join(Post)
            .where(Post.subreddit_id == sub_id, Comment.created_utc >= since)
        )
        flair_stmt = (
            select(Post.flair, func.count(Post.id).label("cnt"))
            .where(Post.subreddit_id == sub_id, Post.created_utc >= since, Post.flair.isnot(None))
            .group_by(Post.flair)
            .order_by(func.count(Post.id).desc())
            .limit(1)
        )
        # Apply the upper bound to the per-subreddit queries too, so every
        # figure in the summary describes the same window.
        if until is not None:
            comment_stmt = comment_stmt.where(Comment.created_utc <= until)
            flair_stmt = flair_stmt.where(Post.created_utc <= until)

        comment_count = (await db.execute(comment_stmt)).scalar() or 0
        top_flair_row = (await db.execute(flair_stmt)).first()

        summaries.append({
            "subreddit_id": sub_id,
            "subreddit_name": sub_name,
            "total_posts": total_posts,
            "total_comments": comment_count,
            "avg_score": round(float(avg_score), 1),
            "top_flair": top_flair_row[0] if top_flair_row else None,
        })
    return summaries
async def get_flair_distribution(
    db: AsyncSession,
    subreddit_id: int,
    since: datetime | None = None,
    until: datetime | None = None,
) -> list[dict]:
    """Return post counts per flair for one subreddit, most common first.

    ``since`` defaults to 30 days before now (UTC). Posts without a flair
    are grouped under a ``None`` flair.
    """
    window_start = since or datetime.now(timezone.utc) - timedelta(days=30)

    criteria = [Post.subreddit_id == subreddit_id, Post.created_utc >= window_start]
    if until:
        criteria.append(Post.created_utc <= until)

    query = (
        select(Post.flair, func.count(Post.id).label("count"))
        .where(*criteria)
        .group_by(Post.flair)
        .order_by(func.count(Post.id).desc())
    )
    rows = (await db.execute(query)).all()

    distribution = []
    for flair, count in rows:
        distribution.append({"flair": flair, "count": count})
    return distribution

View File

@@ -0,0 +1,79 @@
from datetime import datetime
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from backend.models.author import Author
from backend.models.post import Post
from backend.models.comment import Comment
async def list_authors(
    db: AsyncSession,
    subreddit_id: int | None = None,
    sort_by: str = "total_comments",
    sort_order: str = "desc",
    since: datetime | None = None,
    until: datetime | None = None,
    page: int = 1,
    per_page: int = 25,
) -> tuple[list[dict], int]:
    """Return one page of authors together with the total author count.

    Args:
        db: Async session used to run the queries.
        subreddit_id: Restrict the per-author activity counts to this
            subreddit.
        sort_by: Name of an ``Author`` table column to order by; any other
            value falls back to ``total_comments``.
        sort_order: ``"asc"`` sorts ascending, anything else descending.
        since: Inclusive lower bound on the ``created_utc`` of counted
            posts/comments.
        until: Inclusive upper bound on the ``created_utc`` of counted
            posts/comments.
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        ``(authors, total)``. Every author is a dict of its table columns.
        When ``subreddit_id``, ``since`` or ``until`` is given, each dict
        additionally carries ``filtered_posts`` and ``filtered_comments``.
        The filters only narrow those counts; they never remove authors from
        the listing, so ``total`` is the number of stored authors.
    """
    # A negative OFFSET is rejected by the database, so clamp paging input.
    page = max(page, 1)
    per_page = max(per_page, 1)

    has_filters = subreddit_id is not None or since is not None or until is not None

    if has_filters:
        post_count = select(func.count(Post.id)).where(Post.author_id == Author.id)
        comment_count = select(func.count(Comment.id)).where(Comment.author_id == Author.id)
        if subreddit_id is not None:
            post_count = post_count.where(Post.subreddit_id == subreddit_id)
            # Explicit ON clause: the comment's own post decides the subreddit.
            comment_count = comment_count.join(
                Post, Comment.post_id == Post.id
            ).where(Post.subreddit_id == subreddit_id)
        if since is not None:
            post_count = post_count.where(Post.created_utc >= since)
            comment_count = comment_count.where(Comment.created_utc >= since)
        if until is not None:
            post_count = post_count.where(Post.created_utc <= until)
            comment_count = comment_count.where(Comment.created_utc <= until)
        base = select(
            Author,
            post_count.correlate(Author).scalar_subquery().label("filtered_posts"),
            comment_count.correlate(Author).scalar_subquery().label("filtered_comments"),
        )
    else:
        base = select(Author)

    # No filter ever excludes an author, so the total is a plain table count.
    total = (await db.execute(select(func.count()).select_from(Author))).scalar() or 0

    # Only real table columns may be used for sorting; arbitrary attribute
    # names (methods, relationships, ...) would fail on ``.asc()``/``.desc()``.
    columns = Author.__table__.columns
    sort_col = columns[sort_by] if sort_by in columns.keys() else Author.total_comments
    direction = sort_col.asc() if sort_order == "asc" else sort_col.desc()
    # Author.id as tie-breaker keeps pages stable when sort values repeat.
    base = base.order_by(direction, Author.id)
    base = base.offset((page - 1) * per_page).limit(per_page)

    result = await db.execute(base)
    authors = []
    for row in result.all():
        # Every result row wraps the selected entities; the Author is always
        # the first element, with or without the extra count columns.
        author = row[0]
        data = {c.name: getattr(author, c.name) for c in author.__table__.columns}
        if has_filters:
            data["filtered_posts"] = row[1]
            data["filtered_comments"] = row[2]
        authors.append(data)
    return authors, total
async def get_author(db: AsyncSession, author_id: int) -> dict | None:
    """Look up one author by primary key.

    Returns a dict mapping each ``Author`` table column name to its value,
    or ``None`` when no author has that id.
    """
    record = await db.get(Author, author_id)
    if record:
        fields = {}
        for column in record.__table__.columns:
            fields[column.name] = getattr(record, column.name)
        return fields
    return None

View File

@@ -0,0 +1,57 @@
from datetime import datetime
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from backend.models.comment import Comment
from backend.models.post import Post
from backend.models.author import Author
async def list_comments(
    db: AsyncSession,
    post_id: int | None = None,
    subreddit_id: int | None = None,
    author: str | None = None,
    sort_by: str = "created_utc",
    sort_order: str = "desc",
    since: datetime | None = None,
    until: datetime | None = None,
    page: int = 1,
    per_page: int = 25,
) -> tuple[list[dict], int]:
    """Return one page of comments and the total number of matches.

    Args:
        db: Async session used to run the queries.
        post_id: Only comments on this post.
        subreddit_id: Only comments whose post belongs to this subreddit.
        author: Only comments written by this exact username.
        sort_by: Name of a ``Comment`` table column to order by; any other
            value falls back to ``created_utc``.
        sort_order: ``"asc"`` sorts ascending, anything else descending.
        since: Inclusive lower bound on ``Comment.created_utc``.
        until: Inclusive upper bound on ``Comment.created_utc``.
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        ``(comments, total)`` where each comment is a dict of its table
        columns plus ``author_name`` (``None`` when the comment has no
        author row).
    """
    # A negative OFFSET is rejected by the database, so clamp paging input.
    page = max(page, 1)
    per_page = max(per_page, 1)

    # Both ON clauses are spelled out. Post can be reached from Comment
    # (post_id) and from Author (Post.author_id); leaving the condition to
    # inference after the Author join risks joining posts to the author
    # instead of to the comment.
    base = (
        select(Comment, Author.username)
        .outerjoin(Author, Comment.author_id == Author.id)
        .join(Post, Comment.post_id == Post.id)
    )

    filters = []
    if post_id is not None:
        filters.append(Comment.post_id == post_id)
    if subreddit_id is not None:
        filters.append(Post.subreddit_id == subreddit_id)
    if author:
        filters.append(Author.username == author)
    if since:
        filters.append(Comment.created_utc >= since)
    if until:
        filters.append(Comment.created_utc <= until)
    if filters:
        base = base.where(*filters)

    count_stmt = select(func.count()).select_from(base.subquery())
    total = (await db.execute(count_stmt)).scalar() or 0

    # Only real table columns may be used for sorting; arbitrary attribute
    # names would fail on ``.asc()``/``.desc()``.
    columns = Comment.__table__.columns
    sort_col = columns[sort_by] if sort_by in columns.keys() else Comment.created_utc
    direction = sort_col.asc() if sort_order == "asc" else sort_col.desc()
    # Comment.id as tie-breaker keeps pages stable when sort values repeat.
    base = base.order_by(direction, Comment.id)
    base = base.offset((page - 1) * per_page).limit(per_page)

    result = await db.execute(base)
    comments = []
    for comment, author_name in result.all():
        data = {c.name: getattr(comment, c.name) for c in comment.__table__.columns}
        data["author_name"] = author_name
        comments.append(data)
    return comments, total

View File

@@ -0,0 +1,102 @@
from datetime import datetime
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.orm import joinedload
from backend.models.post import Post
from backend.models.subreddit import MonitoredSubreddit
from backend.models.author import Author
from backend.models.comment import Comment
async def list_posts(
    db: AsyncSession,
    subreddit_id: int | None = None,
    author: str | None = None,
    flair: str | None = None,
    sort_by: str = "created_utc",
    sort_order: str = "desc",
    since: datetime | None = None,
    until: datetime | None = None,
    page: int = 1,
    per_page: int = 25,
) -> tuple[list[dict], int]:
    """Return one page of posts and the total number of matches.

    Args:
        db: Async session used to run the queries.
        subreddit_id: Only posts from this subreddit.
        author: Only posts written by this exact username.
        flair: Only posts with exactly this flair.
        sort_by: Name of a ``Post`` table column to order by; any other value
            falls back to ``created_utc``.
        sort_order: ``"asc"`` sorts ascending, anything else descending.
        since: Inclusive lower bound on ``Post.created_utc``.
        until: Inclusive upper bound on ``Post.created_utc``.
        page: 1-based page number; values below 1 are treated as 1.
        per_page: Page size; values below 1 are treated as 1.

    Returns:
        ``(posts, total)`` where each post is a dict of its table columns plus
        ``subreddit_name`` and ``author_name`` (``None`` when the post has no
        author row).
    """
    # A negative OFFSET is rejected by the database, so clamp paging input.
    page = max(page, 1)
    per_page = max(per_page, 1)

    # Join conditions are explicit so they never depend on FK inference.
    base = (
        select(Post, MonitoredSubreddit.name, Author.username)
        .join(MonitoredSubreddit, Post.subreddit_id == MonitoredSubreddit.id)
        .outerjoin(Author, Post.author_id == Author.id)
    )

    filters = []
    if subreddit_id is not None:
        filters.append(Post.subreddit_id == subreddit_id)
    if flair:
        filters.append(Post.flair == flair)
    if since:
        filters.append(Post.created_utc >= since)
    if until:
        filters.append(Post.created_utc <= until)
    if author:
        filters.append(Author.username == author)
    if filters:
        base = base.where(*filters)

    # Count the filtered set before ordering/paging is applied.
    count_stmt = select(func.count()).select_from(base.subquery())
    total = (await db.execute(count_stmt)).scalar() or 0

    # Only real table columns may be used for sorting; arbitrary attribute
    # names would fail on ``.asc()``/``.desc()``.
    columns = Post.__table__.columns
    sort_col = columns[sort_by] if sort_by in columns.keys() else Post.created_utc
    direction = sort_col.asc() if sort_order == "asc" else sort_col.desc()
    # Post.id as tie-breaker keeps pages stable when sort values repeat.
    base = base.order_by(direction, Post.id)
    base = base.offset((page - 1) * per_page).limit(per_page)

    result = await db.execute(base)
    posts = []
    for post, sub_name, author_name in result.all():
        data = {c.name: getattr(post, c.name) for c in post.__table__.columns}
        data["subreddit_name"] = sub_name
        data["author_name"] = author_name
        posts.append(data)
    return posts, total
async def get_post(db: AsyncSession, post_id: int) -> dict | None:
    """Load one post together with all of its comments.

    Returns ``None`` when the post does not exist. Otherwise the result is a
    dict of the post's table columns plus ``subreddit_name``, ``author_name``
    and ``comments`` — a list of comment dicts (table columns plus
    ``author_name``) ordered oldest first.
    """
    post_query = (
        select(Post, MonitoredSubreddit.name, Author.username)
        .join(MonitoredSubreddit)
        .outerjoin(Author)
        .where(Post.id == post_id)
    )
    hit = (await db.execute(post_query)).first()
    if hit is None:
        return None

    post, subreddit_name, post_author = hit
    payload = {col.name: getattr(post, col.name) for col in post.__table__.columns}
    payload["subreddit_name"] = subreddit_name
    payload["author_name"] = post_author

    # Comments are fetched separately and attached in chronological order.
    thread_query = (
        select(Comment, Author.username)
        .outerjoin(Author)
        .where(Comment.post_id == post_id)
        .order_by(Comment.created_utc.asc())
    )
    thread_rows = (await db.execute(thread_query)).all()
    payload["comments"] = [
        {
            **{col.name: getattr(entry, col.name) for col in entry.__table__.columns},
            "author_name": entry_author,
        }
        for entry, entry_author in thread_rows
    ]
    return payload

View File

@@ -0,0 +1,75 @@
from sqlalchemy import select, func
from sqlalchemy.ext.asyncio import AsyncSession
from backend.models.subreddit import MonitoredSubreddit
from backend.models.post import Post
async def list_subreddits(db: AsyncSession) -> list[dict]:
    """List every monitored subreddit, ordered by name, with its post count.

    Each entry is a dict of the subreddit's table columns plus
    ``post_count``; subreddits without posts are included with a count of 0
    because posts are attached through an outer join.
    """
    query = (
        select(MonitoredSubreddit, func.count(Post.id).label("post_count"))
        .outerjoin(Post, Post.subreddit_id == MonitoredSubreddit.id)
        .group_by(MonitoredSubreddit.id)
        .order_by(MonitoredSubreddit.name)
    )
    fetched = (await db.execute(query)).all()
    return [
        {
            **{col.name: getattr(entry, col.name) for col in entry.__table__.columns},
            "post_count": n_posts,
        }
        for entry, n_posts in fetched
    ]
async def get_subreddit(db: AsyncSession, subreddit_id: int) -> dict | None:
    """Fetch one monitored subreddit and the number of posts stored for it.

    Returns a dict of the subreddit's table columns plus ``post_count``, or
    ``None`` when the id is unknown.
    """
    query = (
        select(MonitoredSubreddit, func.count(Post.id).label("post_count"))
        .outerjoin(Post, Post.subreddit_id == MonitoredSubreddit.id)
        .where(MonitoredSubreddit.id == subreddit_id)
        .group_by(MonitoredSubreddit.id)
    )
    hit = (await db.execute(query)).first()
    if hit is None:
        return None

    entry, n_posts = hit
    details = {}
    for col in entry.__table__.columns:
        details[col.name] = getattr(entry, col.name)
    details["post_count"] = n_posts
    return details
async def create_subreddit(db: AsyncSession, name: str) -> MonitoredSubreddit:
    """Persist a new monitored subreddit.

    The name is lower-cased and stripped of surrounding whitespace before it
    is stored. The committed, refreshed row is returned.
    """
    normalized = name.lower().strip()
    record = MonitoredSubreddit(name=normalized)
    db.add(record)
    await db.commit()
    # Reload so database-generated values are present on the instance.
    await db.refresh(record)
    return record
async def update_subreddit(
    db: AsyncSession, subreddit_id: int, is_active: bool | None = None
) -> MonitoredSubreddit | None:
    """Update a monitored subreddit's ``is_active`` flag.

    ``is_active`` is only written when it is not ``None``. Returns the
    committed, refreshed row, or ``None`` when the id is unknown.
    """
    record = await db.get(MonitoredSubreddit, subreddit_id)
    if record:
        if is_active is not None:
            record.is_active = is_active
        await db.commit()
        await db.refresh(record)
        return record
    return None
async def delete_subreddit(db: AsyncSession, subreddit_id: int) -> bool:
    """Soft-delete a subreddit by clearing its ``is_active`` flag.

    The row itself is kept. Returns ``True`` when the subreddit existed and
    was deactivated, ``False`` when the id is unknown.
    """
    record = await db.get(MonitoredSubreddit, subreddit_id)
    found = bool(record)
    if found:
        record.is_active = False
        await db.commit()
    return found

View File

View File

View File

@@ -0,0 +1,140 @@
import logging
from datetime import datetime, timezone, timedelta, date
from sqlalchemy import select, func, create_engine
from sqlalchemy.orm import sessionmaker
from backend.config import settings
from backend.models.subreddit import MonitoredSubreddit
from backend.models.post import Post
from backend.models.comment import Comment
from backend.models.author import Author
from backend.models.daily_digest import DailyDigest
logger = logging.getLogger(__name__)
_engine = create_engine(settings.database_url_sync, pool_size=2, pool_recycle=3600)
SyncSession = sessionmaker(_engine)
def _render_digest_markdown(
    sub_name, digest_day, post_count, comment_count, avg_score, top_posts, top_authors
) -> str:
    """Render one subreddit's daily digest as a markdown document."""
    stats = f"**Posts:** {post_count} | **Comments:** {comment_count}"
    # ``is not None`` (not truthiness) so an average of exactly 0 is shown.
    if avg_score is not None:
        stats += f" | **Avg Score:** {avg_score:.1f}"

    lines = [f"# r/{sub_name} — Daily Digest for {digest_day}", "", stats, ""]
    if top_posts:
        lines.append("## Top Posts")
        for i, (title, score, num_comments, _permalink) in enumerate(top_posts, 1):
            lines.append(f"{i}. **{title}** — {score} pts, {num_comments} comments")
        lines.append("")
    if top_authors:
        lines.append("## Most Active Users")
        for username, cnt in top_authors:
            lines.append(f"- u/{username}: {cnt} comments")
        lines.append("")
    return "\n".join(lines)


def generate_daily_digests():
    """Generate yesterday's digest for every active subreddit.

    "Yesterday" is the previous calendar day in UTC, matching the UTC
    ``[day_start, day_end)`` window used to select posts and comments.
    Subreddits that already have a digest for that date are skipped, so the
    job can be re-run safely. Each digest is committed on its own.
    """
    # Derive the date from a UTC clock: date.today() is server-local and can
    # disagree with the UTC window below around midnight.
    yesterday = datetime.now(timezone.utc).date() - timedelta(days=1)
    day_start = datetime(yesterday.year, yesterday.month, yesterday.day, tzinfo=timezone.utc)
    day_end = day_start + timedelta(days=1)

    with SyncSession() as db:
        subs = db.execute(
            select(MonitoredSubreddit).where(MonitoredSubreddit.is_active == True)  # noqa: E712
        ).scalars().all()

        for sub in subs:
            # Skip subreddits whose digest for this date already exists.
            existing = db.execute(
                select(DailyDigest).where(
                    DailyDigest.subreddit_id == sub.id,
                    DailyDigest.digest_date == yesterday,
                )
            ).scalar_one_or_none()
            if existing:
                continue

            posts_in_window = (
                Post.subreddit_id == sub.id,
                Post.created_utc >= day_start,
                Post.created_utc < day_end,
            )
            comments_in_window = (
                Post.subreddit_id == sub.id,
                Comment.created_utc >= day_start,
                Comment.created_utc < day_end,
            )

            # Post count and average score come from the same rows: one query.
            post_count, avg_score = db.execute(
                select(func.count(Post.id), func.avg(Post.score)).where(*posts_in_window)
            ).one()
            post_count = post_count or 0

            comment_count = db.execute(
                select(func.count(Comment.id))
                .join(Post, Comment.post_id == Post.id)
                .where(*comments_in_window)
            ).scalar() or 0

            # Five highest-scoring posts of the day.
            top_posts = db.execute(
                select(Post.title, Post.score, Post.num_comments, Post.permalink)
                .where(*posts_in_window)
                .order_by(Post.score.desc())
                .limit(5)
            ).all()

            # Five authors with the most comments in this subreddit that day.
            top_authors = db.execute(
                select(Author.username, func.count(Comment.id).label("cnt"))
                .join(Comment, Comment.author_id == Author.id)
                .join(Post, Comment.post_id == Post.id)
                .where(*comments_in_window)
                .group_by(Author.username)
                .order_by(func.count(Comment.id).desc())
                .limit(5)
            ).all()

            content = _render_digest_markdown(
                sub.name, yesterday, post_count, comment_count, avg_score, top_posts, top_authors
            )
            metadata = {
                "post_count": post_count,
                "comment_count": comment_count,
                # avg() yields NULL when the day had no posts.
                "avg_score": float(avg_score) if avg_score is not None else 0,
                "top_posts": [
                    {"title": t, "score": s, "num_comments": n}
                    for t, s, n, _ in top_posts
                ],
                "top_authors": [
                    {"username": u, "comment_count": c}
                    for u, c in top_authors
                ],
            }
            db.add(
                DailyDigest(
                    subreddit_id=sub.id,
                    digest_date=yesterday,
                    content=content,
                    metadata_=metadata,
                )
            )
            db.commit()
            logger.info(f"Generated daily digest for r/{sub.name} on {yesterday}")

90
backend/worker/main.py Normal file
View File

@@ -0,0 +1,90 @@
import logging
import signal
import sys
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.interval import IntervalTrigger
from apscheduler.triggers.cron import CronTrigger
from backend.config import settings
from backend.worker.monitor import poll_new_posts, poll_hot_posts, collect_comments, update_scores
from backend.worker.snapshot import take_metric_snapshots
from backend.worker.digest_job import generate_daily_digests
from backend.worker.summary_job import generate_summaries
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(name)s] %(levelname)s: %(message)s",
)
logger = logging.getLogger(__name__)
def seed_subreddits():
    """Insert the configured seed subreddits that are not stored yet.

    ``settings.seed_subreddits`` is read as a comma-separated list; entries
    are stripped, lower-cased and de-duplicated. Names already present in the
    database are left alone. Does nothing when the setting is empty.
    """
    if not settings.seed_subreddits:
        return

    from sqlalchemy import select, create_engine
    from sqlalchemy.orm import sessionmaker
    from backend.models.subreddit import MonitoredSubreddit

    # dict.fromkeys drops repeated names while keeping the configured order.
    names = list(
        dict.fromkeys(
            s.strip().lower() for s in settings.seed_subreddits.split(",") if s.strip()
        )
    )
    if not names:
        return

    engine = create_engine(settings.database_url_sync)
    Session = sessionmaker(engine)
    try:
        with Session() as db:
            for name in names:
                existing = db.execute(
                    select(MonitoredSubreddit).where(MonitoredSubreddit.name == name)
                ).scalar_one_or_none()
                if not existing:
                    db.add(MonitoredSubreddit(name=name))
                    logger.info(f"Seeded subreddit: r/{name}")
            db.commit()
    finally:
        # This engine is only used for seeding; release its connections even
        # when a query or the commit fails.
        engine.dispose()
def main():
    """Seed subreddits, register all worker jobs and run the scheduler.

    Blocks in ``scheduler.start()`` until SIGTERM/SIGINT arrives, at which
    point the scheduler is shut down without waiting for running jobs and the
    process exits with status 0.
    """
    logger.info("Starting Reddit monitor worker")
    seed_subreddits()

    scheduler = BlockingScheduler()

    # Interval jobs: Reddit polling plus metric snapshots.
    interval_jobs = (
        ("poll_new", poll_new_posts, 2),
        ("poll_hot", poll_hot_posts, 2),
        ("comments", collect_comments, 5),
        ("scores", update_scores, 15),
        ("snapshots", take_metric_snapshots, 30),
    )
    for job_id, job, minutes in interval_jobs:
        scheduler.add_job(job, IntervalTrigger(minutes=minutes), id=job_id, max_instances=1)

    # The setting is named digest_hour_utc, so pin the cron triggers to UTC;
    # without an explicit timezone they follow the scheduler's timezone.
    scheduler.add_job(
        generate_daily_digests,
        CronTrigger(hour=settings.digest_hour_utc, minute=0, timezone="UTC"),
        id="digest",
        max_instances=1,
    )
    # AI summary stub, half an hour after the digest.
    scheduler.add_job(
        generate_summaries,
        CronTrigger(hour=settings.digest_hour_utc, minute=30, timezone="UTC"),
        id="summary",
        max_instances=1,
    )

    def shutdown(signum, frame):
        logger.info("Shutting down worker...")
        scheduler.shutdown(wait=False)
        sys.exit(0)

    signal.signal(signal.SIGTERM, shutdown)
    signal.signal(signal.SIGINT, shutdown)

    logger.info("Worker started. Scheduled jobs are running.")
    scheduler.start()


if __name__ == "__main__":
    main()

295
backend/worker/monitor.py Normal file
View File

@@ -0,0 +1,295 @@
import logging
from datetime import datetime, timezone, timedelta
from sqlalchemy import select, create_engine
from sqlalchemy.orm import Session, sessionmaker
from sqlalchemy.dialects.postgresql import insert
from backend.config import settings
from backend.models.subreddit import MonitoredSubreddit
from backend.models.author import Author
from backend.models.post import Post
from backend.models.comment import Comment
from backend.worker.reddit_client import create_client, fetch_json
logger = logging.getLogger(__name__)
# Sync engine for worker (PRAW-replacement uses async httpx, but DB writes are sync for simplicity with APScheduler)
_engine = create_engine(settings.database_url_sync, pool_size=3, max_overflow=5, pool_recycle=3600)
SyncSession = sessionmaker(_engine)
def _get_active_subreddits() -> list[dict]:
    """Load ``{"id", "name"}`` for every subreddit flagged as active.

    Plain dicts are returned so callers can keep using the values after the
    session opened here has been closed.
    """
    active_only = select(MonitoredSubreddit).where(MonitoredSubreddit.is_active == True)  # noqa: E712
    found = []
    with SyncSession() as db:
        for sub in db.execute(active_only).scalars():
            found.append({"id": sub.id, "name": sub.name})
    return found
def _upsert_author(db: Session, username: str) -> int | None:
    """Insert the author, or bump ``last_seen_at`` if the username exists.

    Returns the author's id, or ``None`` for a missing/empty username and
    for the ``"[deleted]"`` placeholder (no row is written in that case).
    The caller is responsible for committing.
    """
    if not username or username == "[deleted]":
        return None

    seen_at = datetime.now(timezone.utc)
    upsert = (
        insert(Author)
        .values(username=username, first_seen_at=seen_at, last_seen_at=seen_at)
        .on_conflict_do_update(
            index_elements=[Author.username],
            set_={"last_seen_at": seen_at},
        )
    )
    db.execute(upsert)

    # Read the id back; it exists whether the row was inserted or updated.
    match = db.execute(select(Author.id).where(Author.username == username)).first()
    if match:
        return match[0]
    return None
def _parse_post(post_data: dict, subreddit_id: int, db: Session, hot_rank: int | None = None) -> dict:
    """Convert one Reddit listing child into a row dict for the posts table.

    Args:
        post_data: Either the listing child (with the payload under
            ``"data"``) or the payload itself.
        subreddit_id: Id of the monitored subreddit the post belongs to.
        db: Session used to upsert the post's author.
        hot_rank: Position in the /hot listing, if known.

    Returns:
        A dict keyed by post column names, ready for ``_upsert_posts``.
    """
    data = post_data.get("data", post_data)
    author_id = _upsert_author(db, data.get("author"))

    # One clock read so a new row gets collected_at == updated_at.
    now = datetime.now(timezone.utc)
    # ``or 0`` also covers an explicit null, which fromtimestamp() rejects.
    created = datetime.fromtimestamp(data.get("created_utc") or 0, tz=timezone.utc)
    # Fall back to the short id when the fullname is absent or null.
    reddit_id = data.get("name") or f"t3_{data.get('id', '')}"

    return {
        "reddit_id": reddit_id,
        "subreddit_id": subreddit_id,
        "author_id": author_id,
        "title": data.get("title", ""),
        "selftext": data.get("selftext"),
        "url": data.get("url"),
        "permalink": data.get("permalink"),
        "flair": data.get("link_flair_text"),
        "score": data.get("score", 0),
        "upvote_ratio": data.get("upvote_ratio"),
        "num_comments": data.get("num_comments", 0),
        "is_self": data.get("is_self"),
        "over_18": data.get("over_18", False),
        "hot_rank": hot_rank,
        "created_utc": created,
        "collected_at": now,
        "updated_at": now,
    }
def _upsert_posts(db: Session, posts: list[dict], update_hot_rank: bool = False):
    """Insert posts, refreshing the volatile fields of ones already stored.

    On a ``reddit_id`` conflict only ``score``, ``upvote_ratio``,
    ``num_comments`` and ``updated_at`` are overwritten, plus ``hot_rank``
    when ``update_hot_rank`` is true. The caller is responsible for
    committing. Does nothing for an empty list.
    """
    if not posts:
        return

    # PostgreSQL rejects an ON CONFLICT DO UPDATE statement that would touch
    # the same row twice, so keep one entry per reddit_id (the last one wins).
    unique_posts = list({post["reddit_id"]: post for post in posts}.values())

    stmt = insert(Post).values(unique_posts)
    update_set = {
        "score": stmt.excluded.score,
        "upvote_ratio": stmt.excluded.upvote_ratio,
        "num_comments": stmt.excluded.num_comments,
        "updated_at": stmt.excluded.updated_at,
    }
    if update_hot_rank:
        update_set["hot_rank"] = stmt.excluded.hot_rank

    stmt = stmt.on_conflict_do_update(
        index_elements=[Post.reddit_id],
        set_=update_set,
    )
    db.execute(stmt)
def _parse_comment(comment_data: dict, post_id: int, db: Session, parent_map: dict) -> dict | None:
    """Convert one Reddit comment into a row dict for the comments table.

    Args:
        comment_data: Either the listing child (with the payload under
            ``"data"``) or the payload itself.
        post_id: Database id of the post the comment belongs to.
        db: Session used to upsert the comment's author.
        parent_map: Maps comment fullnames (``reddit_id``) to database ids;
            used to resolve ``parent_comment_id``.

    Returns:
        A dict keyed by comment column names, or ``None`` for "more"
        placeholders and entries without a body.
    """
    # "kind" lives on the listing wrapper, so test it before unwrapping.
    if comment_data.get("kind") == "more":
        return None
    data = comment_data.get("data", comment_data)
    if data.get("kind") == "more" or not data.get("body"):
        return None

    # Fall back to the short id when the fullname is absent or null.
    reddit_id = data.get("name") or f"t1_{data.get('id', '')}"
    author_id = _upsert_author(db, data.get("author"))

    # One clock read so a new row gets collected_at == updated_at.
    now = datetime.now(timezone.utc)
    # ``or 0`` also covers an explicit null, which fromtimestamp() rejects.
    created = datetime.fromtimestamp(data.get("created_utc") or 0, tz=timezone.utc)

    # A parent that is not in the map (e.g. the post itself) resolves to None.
    parent_comment_id = parent_map.get(data.get("parent_id", ""))

    return {
        "reddit_id": reddit_id,
        "post_id": post_id,
        "parent_comment_id": parent_comment_id,
        "author_id": author_id,
        "body": data.get("body", ""),
        "score": data.get("score", 0),
        "created_utc": created,
        "collected_at": now,
        "updated_at": now,
    }
import asyncio
def poll_new_posts():
    """Synchronous entry point: run one /new polling pass to completion."""
    polling_pass = _poll_new_posts_async()
    asyncio.run(polling_pass)
async def _poll_new_posts_async():
    """Fetch up to 100 /new posts per active subreddit and upsert them.

    Subreddits whose request fails or returns no posts are skipped. Each
    subreddit is written and committed in its own session.
    """
    active = _get_active_subreddits()
    if not active:
        return

    async with create_client() as client:
        for sub in active:
            listing = await fetch_json(client, f"/r/{sub['name']}/new", {"limit": "100"})
            children = listing.get("data", {}).get("children", []) if listing else []
            if not children:
                continue

            with SyncSession() as db:
                parsed = []
                for child in children:
                    parsed.append(_parse_post(child, sub["id"], db))
                _upsert_posts(db, parsed)
                db.commit()
            logger.info(f"r/{sub['name']}: upserted {len(children)} new posts")
def poll_hot_posts():
    """Synchronous entry point: run one /hot polling pass to completion."""
    polling_pass = _poll_hot_posts_async()
    asyncio.run(polling_pass)
async def _poll_hot_posts_async():
    """Fetch up to 100 /hot posts per active subreddit and store their rank.

    A post's ``hot_rank`` is its 1-based position in the returned listing.
    Subreddits whose request fails or returns no posts are skipped. Each
    subreddit is written and committed in its own session.
    """
    active = _get_active_subreddits()
    if not active:
        return

    async with create_client() as client:
        for sub in active:
            listing = await fetch_json(client, f"/r/{sub['name']}/hot", {"limit": "100"})
            children = listing.get("data", {}).get("children", []) if listing else []
            if not children:
                continue

            with SyncSession() as db:
                ranked = []
                for rank, child in enumerate(children, start=1):
                    ranked.append(_parse_post(child, sub["id"], db, hot_rank=rank))
                _upsert_posts(db, ranked, update_hot_rank=True)
                db.commit()
            logger.info(f"r/{sub['name']}: updated hot ranks for {len(children)} posts")
def collect_comments():
    """Synchronous entry point: run one comment-collection pass to completion."""
    collection_pass = _collect_comments_async()
    asyncio.run(collection_pass)
def _upsert_comment(db: Session, comment: dict) -> int:
    """Insert or refresh one comment row and return its database id."""
    stmt = insert(Comment).values(comment)
    updates = {
        "score": stmt.excluded.score,
        "body": stmt.excluded.body,
        "updated_at": stmt.excluded.updated_at,
    }
    # Repair a parent link that could not be resolved on an earlier run, but
    # never overwrite a stored link with NULL.
    if comment.get("parent_comment_id") is not None:
        updates["parent_comment_id"] = stmt.excluded.parent_comment_id
    stmt = stmt.on_conflict_do_update(
        index_elements=[Comment.reddit_id],
        set_=updates,
    ).returning(Comment.id)
    return db.execute(stmt).scalar_one()


async def _collect_comments_async():
    """Fetch and upsert comments for the 50 newest posts of the last 48 hours.

    Only posts of active subreddits are considered. For each post the comment
    tree is walked parent-first and every comment is upserted as it is
    visited, so a reply always finds its parent's database id — including
    parents first seen in the same pass. "more" placeholders are skipped.
    Each post is committed in its own session.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(hours=48)
    with SyncSession() as db:
        stmt = (
            select(Post.id, Post.reddit_id)
            .join(MonitoredSubreddit, Post.subreddit_id == MonitoredSubreddit.id)
            .where(
                MonitoredSubreddit.is_active == True,  # noqa: E712
                Post.created_utc >= cutoff,
            )
            .order_by(Post.created_utc.desc())
            .limit(50)
        )
        recent_posts = [(row[0], row[1]) for row in db.execute(stmt)]

    if not recent_posts:
        return

    async with create_client() as client:
        for post_id, post_reddit_id in recent_posts:
            short_id = post_reddit_id.replace("t3_", "")
            data = await fetch_json(client, f"/comments/{short_id}", {"limit": "500", "sort": "new"})
            # A usable response is a two-element list: [post listing, comment
            # listing]. Anything else (error object, None) is skipped.
            if not isinstance(data, list) or len(data) < 2:
                continue
            top_level = data[1].get("data", {}).get("children", [])

            with SyncSession() as db:
                # Seed the fullname -> id map with comments stored earlier.
                existing = db.execute(
                    select(Comment.id, Comment.reddit_id).where(Comment.post_id == post_id)
                )
                parent_map = {row[1]: row[0] for row in existing}

                upserted = 0
                # Explicit stack, pre-order: a comment is always handled
                # before any of its replies.
                pending = list(reversed(top_level))
                while pending:
                    child = pending.pop()
                    if child.get("kind") == "more":
                        continue
                    c_data = child.get("data", {})
                    parsed = _parse_comment(c_data, post_id, db, parent_map)
                    if parsed:
                        # Register the id immediately so replies can link to it.
                        parent_map[parsed["reddit_id"]] = _upsert_comment(db, parsed)
                        upserted += 1
                    replies = c_data.get("replies")
                    # Reddit sends "" instead of a listing when there are no replies.
                    if isinstance(replies, dict):
                        reply_children = replies.get("data", {}).get("children", [])
                        pending.extend(reversed(reply_children))

                if upserted:
                    db.commit()
                    logger.info(f"Post {short_id}: upserted {upserted} comments")
def update_scores():
    """Synchronous entry point: run one score-refresh pass to completion."""
    refresh_pass = _update_scores_async()
    asyncio.run(refresh_pass)
async def _update_scores_async():
    """Refresh scores for active subreddits that have posts from the last 7 days.

    For every such subreddit the newest 100 posts are fetched from /new and
    upserted, which updates score, upvote ratio and comment count of posts
    already stored.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=7)
    with SyncSession() as db:
        stmt = (
            select(MonitoredSubreddit.id, MonitoredSubreddit.name)
            .join(Post, Post.subreddit_id == MonitoredSubreddit.id)
            .where(
                MonitoredSubreddit.is_active == True,  # noqa: E712
                Post.created_utc >= cutoff,
            )
            .distinct()
        )
        # name -> id; the id is reused below instead of being looked up again.
        subs_with_recent_posts = {name: sub_id for sub_id, name in db.execute(stmt)}

    if not subs_with_recent_posts:
        return

    # NOTE(review): this re-reads /new, so only the newest 100 posts of a
    # subreddit are refreshed. Posts older than that keep their last known
    # score; refreshing them would need a per-post lookup.
    async with create_client() as client:
        for sub_name, sub_id in subs_with_recent_posts.items():
            data = await fetch_json(client, f"/r/{sub_name}/new", {"limit": "100"})
            if not data:
                continue
            children = data.get("data", {}).get("children", [])
            if not children:
                continue
            with SyncSession() as db:
                posts = [_parse_post(child, sub_id, db) for child in children]
                _upsert_posts(db, posts)
                db.commit()

    logger.info(f"Score update complete for {len(subs_with_recent_posts)} subreddits")

View File

@@ -0,0 +1,58 @@
import asyncio
import logging
import time
import httpx
from backend.config import settings
logger = logging.getLogger(__name__)
BASE_URL = "https://www.reddit.com"
# Simple in-process rate limiter: track request timestamps
_request_times: list[float] = []
MAX_REQUESTS_PER_MINUTE = 9 # Stay under Reddit's ~10/min limit
async def _wait_for_rate_limit():
    """Sleep if needed so the request budget for the last minute is respected.

    Request times are kept in the module-level ``_request_times`` list. When
    ``MAX_REQUESTS_PER_MINUTE`` requests were already made within the last
    60 seconds, this sleeps until the oldest one is 60 seconds old (plus a
    0.5 s margin). The current request is then recorded.
    """
    checked_at = time.monotonic()
    window_start = checked_at - 60

    # Forget requests that have left the 60-second window.
    while _request_times and _request_times[0] < window_start:
        del _request_times[0]

    if len(_request_times) >= MAX_REQUESTS_PER_MINUTE:
        oldest_age = checked_at - _request_times[0]
        wait = 60 - oldest_age + 0.5
        logger.info(f"Rate limit: waiting {wait:.1f}s")
        await asyncio.sleep(wait)

    # Stamp with a fresh clock read: time may have passed while sleeping.
    _request_times.append(time.monotonic())
async def fetch_json(
    client: httpx.AsyncClient,
    path: str,
    params: dict | None = None,
    max_retries: int = 3,
) -> dict | list | None:
    """Fetch a Reddit ``.json`` endpoint with rate limiting and error handling.

    Args:
        client: HTTP client to send the request with.
        path: Path below ``BASE_URL`` without the ``.json`` suffix.
        params: Optional query parameters.
        max_retries: How many times a 429 response is retried before giving
            up.

    Returns:
        The decoded JSON body (a dict for listings, a list for comment
        pages), or ``None`` when the request fails, the server answers with
        an error status, the body is not valid JSON, or the retry budget for
        429 responses is exhausted.
    """
    url = f"{BASE_URL}{path}.json"

    # A bounded loop replaces unbounded recursion on repeated 429 responses.
    for attempt in range(max_retries + 1):
        await _wait_for_rate_limit()
        try:
            response = await client.get(url, params=params)
        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching {path}: {e}")
            return None

        if response.status_code == 429:
            if attempt >= max_retries:
                break
            # Retry-After may be missing or not a plain integer (it can be
            # an HTTP date); fall back to 60 seconds in that case.
            try:
                retry_after = max(0, int(response.headers.get("Retry-After", 60)))
            except (TypeError, ValueError):
                retry_after = 60
            logger.warning(f"Rate limited, waiting {retry_after}s")
            await asyncio.sleep(retry_after)
            continue

        if response.status_code >= 500:
            logger.warning(f"Reddit returned {response.status_code} for {path}")
            return None

        try:
            response.raise_for_status()
            return response.json()
        except httpx.HTTPError as e:
            logger.error(f"HTTP error fetching {path}: {e}")
            return None
        except ValueError as e:
            # json() raises a ValueError subclass when the body is not JSON.
            logger.error(f"Invalid JSON from {path}: {e}")
            return None

    logger.error(f"Giving up on {path}: still rate limited after {max_retries} retries")
    return None
def create_client() -> httpx.AsyncClient:
    """Build the async HTTP client used for all Reddit requests.

    The client sends the configured User-Agent, applies a 30 second timeout
    and follows redirects.
    """
    options = {
        "headers": {"User-Agent": settings.reddit_user_agent},
        "timeout": 30.0,
        "follow_redirects": True,
    }
    return httpx.AsyncClient(**options)

View File

@@ -0,0 +1,47 @@
import logging
from datetime import datetime, timezone, timedelta
from sqlalchemy import select, create_engine
from sqlalchemy.orm import sessionmaker
from backend.config import settings
from backend.models.post import Post
from backend.models.metric_snapshot import MetricSnapshot
from backend.models.subreddit import MonitoredSubreddit
logger = logging.getLogger(__name__)
_engine = create_engine(settings.database_url_sync, pool_size=2, pool_recycle=3600)
SyncSession = sessionmaker(_engine)
def take_metric_snapshots():
    """Record the current metrics of every recent post.

    A ``MetricSnapshot`` (score, comment count, upvote ratio) is stored for
    each post of an active subreddit created within the last 48 hours. All
    snapshots of one run share the same ``snapshot_at`` timestamp.
    """
    taken_at = datetime.now(timezone.utc)
    recent_cutoff = taken_at - timedelta(hours=48)

    recent_posts = (
        select(Post.id, Post.score, Post.num_comments, Post.upvote_ratio)
        .join(MonitoredSubreddit)
        .where(
            MonitoredSubreddit.is_active == True,  # noqa: E712
            Post.created_utc >= recent_cutoff,
        )
    )

    with SyncSession() as db:
        snapshots = [
            MetricSnapshot(
                post_id=post_id,
                score=score,
                num_comments=num_comments,
                upvote_ratio=upvote_ratio,
                snapshot_at=taken_at,
            )
            for post_id, score, num_comments, upvote_ratio in db.execute(recent_posts)
        ]
        # Nothing to write (and nothing to commit) when no post qualifies.
        if snapshots:
            db.add_all(snapshots)
            db.commit()

    logger.info(f"Took {len(snapshots)} metric snapshots")

View File

@@ -0,0 +1,12 @@
import logging
from backend.config import settings
logger = logging.getLogger(__name__)
def generate_summaries():
    """Placeholder for AI summary generation.

    Does nothing unless ``settings.ai_summary_enabled`` is set; when it is,
    only a log line is emitted because no provider is wired in yet.
    """
    if settings.ai_summary_enabled:
        logger.info("AI summary generation not yet configured")