caching with sql

main
Brett 2025-06-26 20:44:51 -04:00
parent 48cc36011e
commit db391a6163
4 changed files with 189 additions and 7 deletions


@@ -8,7 +8,7 @@ from typing import Final, Optional, List
import discord
from dotenv import load_dotenv
import re
-from pool import PlaywrightPool
+from pool import PlaywrightPool, ArticleRepository
import trafilatura
import io
@@ -25,12 +25,14 @@ intents.message_content = True
bot = discord.Client(intents=intents)

-LOGGER = logging.getLogger("Newsulizer")
+LOGGER = logging.getLogger("main")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

article_repository = ArticleRepository()

async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
    fp = io.BytesIO(content.encode("utf-8"))
    file = discord.File(fp, filename=filename)
@@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
    LOGGER.info("Received URL from %s: %s", message.author, url)

    try:
-        html = await PlaywrightPool.fetch_html(url)
+        processed_html = await article_repository.get_article(url)
        # TODO: parse `html`, summarise, etc.
-        await message.channel.send(f"✅ Article downloaded {len(html):,} bytes.")
-        await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
+        await message.channel.send(f"✅ Article downloaded {len(processed_html):,} bytes.")
+        await send_text_file(message.channel, processed_html)
    except:
        LOGGER.exception("Playwright failed")
        await message.channel.send("❌ Sorry, I couldn't fetch that page.")

BIN  news/newsulizer.sqlite3  (new file, binary content not shown)


@@ -1,7 +1,25 @@
from __future__ import annotations

from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from typing import Final, Optional
import asyncio
import os
import sqlite3
import trafilatura
import types
from typing import Final, Optional, Union, Protocol, Any, Tuple
import logging


def process_html(html):
    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
                                include_tables=True, include_comments=False, favor_recall=True)


LOGGER = logging.getLogger("pool")

# logging.basicConfig(
#     level=logging.INFO,
#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
# )


class PlaywrightPool:
    _pw = None                      # playwright instance
    _browser: Optional[Browser] = None
@@ -54,4 +72,166 @@ class PlaywrightPool:
            html = await page.content()
            return html
        finally:
            await page.close()

class DBConnectionInfo:
    def __init__(
        self,
        dbname: str,
        user: str,
        password: str,
        host: str = "localhost",
        port: int = 5432,
    ) -> None:
        self.host = host
        self.port = port
        self.dbname = dbname
        self.user = user
        self.password = password

class ArticleRepository:
    """
    A very small wrapper around a database that maintains a single table
    called 'articles' inside a database called 'newsulizer'.

    If you pass a DBConnectionInfo, a PostgreSQL connection is opened with it.
    If you don't pass anything, a local SQLite file called
    './newsulizer.sqlite3' is created/used automatically.
    """

    _CREATE_DB_SQLITE = "newsulizer.sqlite3"
    _TABLE_NAME = "articles"
    def __init__(
        self,
        connection_info: Optional[DBConnectionInfo] = None,
        sqlite_path: Optional[str] = None,
    ) -> None:
        """
        Parameters
        ----------
        sqlite_path:
            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
            when *connection_info* is omitted.
        """
        if connection_info is None:
            sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
            connection = self._make_sqlite_conn(sqlite_path)
            self.cursor_type = "?"
        else:
            connection = self._make_postgres_conn(
                host=connection_info.host,
                port=connection_info.port,
                dbname=connection_info.dbname,
                user=connection_info.user,
                password=connection_info.password,
            )
            self.cursor_type = "%s"

        self._conn = connection
        self._ensure_schema()

        # Protect SQLite (which is not async-safe) by one lock
        self._lock = asyncio.Lock()
    # ------------------------------------------------------------------ #
    # public API
    # ------------------------------------------------------------------ #
    async def get_article(self, url: str) -> str:
        """
        Main entry point.
        Returns the processed text if it is already cached.
        Otherwise downloads it, processes it, stores it, and returns it.
        """
        # Single writer at a time when using sqlite3 avoids `database is locked`
        async with self._lock:
            row = self._row_for_url(url)
            if row and row[3]:             # row = (id, url, raw, processed)
                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
                return row[3]              # processed_html already present

        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
        raw_html = await PlaywrightPool.fetch_html(url)
        processed_html = process_html(raw_html)

        async with self._lock:
            # Upsert:
            self._conn.execute(
                f"""
                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
                ON CONFLICT(url) DO UPDATE SET
                    raw_html=EXCLUDED.raw_html,
                    processed_html=EXCLUDED.processed_html
                """,
                (url, raw_html, processed_html),
            )
            self._conn.commit()

        return processed_html
    def close(self) -> None:
        """Close the underlying DB connection."""
        try:
            self._conn.close()
        except Exception:
            pass
    # ------------------------------------------------------------------ #
    # internals
    # ------------------------------------------------------------------ #
    def _ensure_schema(self) -> None:
        """Create the articles table if it does not yet exist."""
        # Simple feature detection for DBs that do not support
        # `ON CONFLICT` (mainly older MySQL) could be added here.
        self._conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                raw_html TEXT NOT NULL,
                processed_html TEXT NOT NULL
            )
            """
        )
        self._conn.commit()

    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
        cur = self._conn.cursor()
        cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
        return cur.fetchone()
    @staticmethod
    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
        first_time = not os.path.exists(sqlite_path)
        connection = sqlite3.connect(sqlite_path, check_same_thread=False)

        # Enforce basic integrity
        connection.execute("PRAGMA foreign_keys = ON")
        connection.execute("PRAGMA busy_timeout = 5000")

        if first_time:
            # Ensure a human-readable filename, not an unnamed ATTACH
            LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
        else:
            LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")

        return connection
    @staticmethod
    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
        try:
            import psycopg2
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "psycopg2 is required for PostgreSQL support; "
                "run `pip install psycopg2-binary`"
            ) from exc

        conn = psycopg2.connect(
            host=host, port=port, dbname=dbname, user=user, password=password
        )
        conn.autocommit = False
        return conn
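
A minimal usage sketch of the new ArticleRepository outside the bot, assuming the module is importable as pool (as the bot changes above do) and that PlaywrightPool brings up its browser on the first fetch_html call; the entry point and example URL are illustrative only:

import asyncio

from pool import ArticleRepository

async def main() -> None:
    repo = ArticleRepository()  # no connection_info, so ./newsulizer.sqlite3 is created or reused

    # First call: cache miss, so the URL is fetched via PlaywrightPool,
    # run through process_html, stored in the articles table, and returned.
    markdown = await repo.get_article("https://example.com/some-article")

    # Second call for the same URL: served straight from the processed_html column.
    cached = await repo.get_article("https://example.com/some-article")
    assert markdown == cached

    repo.close()

asyncio.run(main())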
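
The caching write in get_article leans on the ON CONFLICT upsert, which requires SQLite 3.24 or newer. A small self-contained sketch of that behaviour against an in-memory database with the same table shape (the URL and values are made up):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    """
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT UNIQUE NOT NULL,
        raw_html TEXT NOT NULL,
        processed_html TEXT NOT NULL
    )
    """
)

upsert = """
    INSERT INTO articles (url, raw_html, processed_html)
    VALUES (?, ?, ?)
    ON CONFLICT(url) DO UPDATE SET
        raw_html=EXCLUDED.raw_html,
        processed_html=EXCLUDED.processed_html
"""

# Same URL twice: the second execute updates the row instead of inserting a duplicate.
conn.execute(upsert, ("https://example.com/a", "<html>v1</html>", "v1"))
conn.execute(upsert, ("https://example.com/a", "<html>v2</html>", "v2"))
conn.commit()

print(conn.execute("SELECT url, processed_html FROM articles").fetchall())
# [('https://example.com/a', 'v2')]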