caching with sql

main
Brett 2025-06-26 20:44:51 -04:00
parent 48cc36011e
commit db391a6163
4 changed files with 189 additions and 7 deletions

@@ -8,7 +8,7 @@ from typing import Final, Optional, List
import discord
from dotenv import load_dotenv
import re
-from pool import PlaywrightPool
+from pool import PlaywrightPool, ArticleRepository
import trafilatura
import io
@@ -25,12 +25,14 @@ intents.message_content = True
bot = discord.Client(intents=intents)

-LOGGER = logging.getLogger("Newsulizer")
+LOGGER = logging.getLogger("main")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

+article_repository = ArticleRepository()

async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
    fp = io.BytesIO(content.encode("utf-8"))
    file = discord.File(fp, filename=filename)
@@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
    LOGGER.info("Received URL from %s: %s", message.author, url)
    try:
-       html = await PlaywrightPool.fetch_html(url)
+       processed_html = await article_repository.get_article(url)
        # TODO: parse `html`, summarise, etc.
-       await message.channel.send(f"✅ Article downloaded {len(html):,} bytes.")
-       await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
+       await message.channel.send(f"✅ Article downloaded {len(processed_html):,} bytes.")
+       await send_text_file(message.channel, processed_html)
    except:
        LOGGER.exception("Playwright failed")
        await message.channel.send("❌ Sorry, I couldn't fetch that page.")

BIN  news/newsulizer.sqlite3 (new file, binary not shown)

@@ -1,7 +1,25 @@
from __future__ import annotations
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from typing import Final, Optional
import asyncio
+import os
+import sqlite3
+import trafilatura
+import types
+from typing import Final, Optional, Union, Protocol, Any, Tuple
+import logging
+
+
+def process_html(html):
+    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
+                               include_tables=True, include_comments=False, favor_recall=True)
+
+
+LOGGER = logging.getLogger("pool")
+
+# logging.basicConfig(
+#     level=logging.INFO,
+#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+# )
class PlaywrightPool:
    _pw = None  # playwright instance
    _browser: Optional[Browser] = None

@@ -54,4 +72,166 @@ class PlaywrightPool:
            html = await page.content()
            return html
        finally:
            await page.close()

class DBConnectionInfo:
    def __init__(
        self,
        dbname: str,
        user: str,
        password: str,
        host: str = "localhost",
        port: int = 5432,
    ) -> None:
        self.host = host
        self.port = port
        self.dbname = dbname
        self.user = user
        self.password = password

class ArticleRepository:
    """
    A very small wrapper around a database that maintains a single table
    called 'articles' inside a database called 'newsulizer'.

    If you pass an existing DB-API connection, it will be used as-is.
    If you don't pass anything, a local SQLite file called
    './newsulizer.sqlite3' is created/used automatically.
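
    Example (illustrative sketch, not part of the committed code):
        repo = ArticleRepository()           # opens ./newsulizer.sqlite3
        text = await repo.get_article(url)   # cache-first fetch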
"""
_CREATE_DB_SQLITE = "newsulizer.sqlite3"
_TABLE_NAME = "articles"
    def __init__(
        self,
        connection_info: Optional[DBConnectionInfo] = None,
        sqlite_path: Optional[str] = None,
    ) -> None:
        """
        Parameters
        ----------
        sqlite_path:
            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
            when *connection_info* is omitted.
        """
        if connection_info is None:
            sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
            connection = self._make_sqlite_conn(sqlite_path)
            self.cursor_type = "?"   # sqlite3 uses qmark-style placeholders
        else:
            connection = self._make_postgres_conn(
                host=connection_info.host,
                port=connection_info.port,
                dbname=connection_info.dbname,
                user=connection_info.user,
                password=connection_info.password,
            )
            self.cursor_type = "%s"  # psycopg2 uses format-style placeholders

        self._conn = connection
        self._ensure_schema()

        # Protect SQLite (which is not async-safe) with a single lock
        self._lock = asyncio.Lock()
    # ------------------------------------------------------------------ #
    # public API
    # ------------------------------------------------------------------ #
    async def get_article(self, url: str) -> str:
        """
        Main entry point.
        Returns the processed text if it is already cached;
        otherwise downloads it, processes it, stores it, and returns it.
        """
        # A single writer at a time avoids sqlite3 `database is locked` errors
        async with self._lock:
            row = self._row_for_url(url)
            if row and row[3]:  # row = (id, url, raw, processed)
                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
                return row[3]  # processed_html already present

        # Fetch outside the lock so a slow download doesn't block other callers
        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
        raw_html = await PlaywrightPool.fetch_html(url)
        processed_html = process_html(raw_html)

        async with self._lock:
            # Upsert (needs SQLite 3.24+ / PostgreSQL 9.5+ for ON CONFLICT)
            self._conn.execute(
                f"""
                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
                ON CONFLICT(url) DO UPDATE SET
                    raw_html=EXCLUDED.raw_html,
                    processed_html=EXCLUDED.processed_html
                """,
                (url, raw_html, processed_html),
            )
            self._conn.commit()

        return processed_html
    def close(self) -> None:
        """Close the underlying DB connection."""
        try:
            self._conn.close()
        except Exception:
            pass
    # ------------------------------------------------------------------ #
    # internals
    # ------------------------------------------------------------------ #
    def _ensure_schema(self) -> None:
        """Create the articles table if it does not yet exist."""
        # Simple feature detection for DBs that do not support
        # `ON CONFLICT` (mainly older MySQL) could be added here.
        # NOTE: AUTOINCREMENT is SQLite syntax; PostgreSQL would need
        # an IDENTITY/SERIAL column instead.
        self._conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                raw_html TEXT NOT NULL,
                processed_html TEXT NOT NULL
            )
            """
        )
        self._conn.commit()
    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
        cur = self._conn.cursor()
        cur.execute(
            f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}",
            (url,),
        )
        return cur.fetchone()
    @staticmethod
    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
        first_time = not os.path.exists(sqlite_path)
        connection = sqlite3.connect(sqlite_path, check_same_thread=False)
        # Enforce basic integrity and wait up to 5s on a locked database
        connection.execute("PRAGMA foreign_keys = ON")
        connection.execute("PRAGMA busy_timeout = 5000")
        if first_time:
            LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
        else:
            LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")
        return connection
    @staticmethod
    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
        try:
            import psycopg2
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "psycopg2 is required for PostgreSQL support; "
                "run `pip install psycopg2-binary`"
            ) from exc

        conn = psycopg2.connect(
            host=host, port=port, dbname=dbname, user=user, password=password
        )
        conn.autocommit = False
        return conn
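
For reference, a sketch of pointing the repository at PostgreSQL instead of the default SQLite file. The credentials below are placeholders, psycopg2-binary must be installed, and (as noted in _ensure_schema) the CREATE TABLE statement would need a PostgreSQL-compatible id column before this works end-to-end:

from pool import ArticleRepository, DBConnectionInfo

info = DBConnectionInfo(
    dbname="newsulizer",
    user="newsbot",        # placeholder credentials
    password="change-me",
    host="localhost",
    port=5432,
)
repo = ArticleRepository(connection_info=info)  # placeholders switch from ? to %s
repo.close()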