caching with sql

main
Brett 2025-06-26 20:44:51 -04:00
parent 48cc36011e
commit db391a6163
4 changed files with 189 additions and 7 deletions


@@ -8,7 +8,7 @@ from typing import Final, Optional, List
import discord
from dotenv import load_dotenv
import re
-from pool import PlaywrightPool
+from pool import PlaywrightPool, ArticleRepository
import trafilatura
import io
@@ -25,12 +25,14 @@ intents.message_content = True
bot = discord.Client(intents=intents)

-LOGGER = logging.getLogger("Newsulizer")
+LOGGER = logging.getLogger("main")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

article_repository = ArticleRepository()

async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
    fp = io.BytesIO(content.encode("utf-8"))
    file = discord.File(fp, filename=filename)
@@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
    LOGGER.info("Received URL from %s: %s", message.author, url)

    try:
-        html = await PlaywrightPool.fetch_html(url)
+        processed_html = await article_repository.get_article(url)
        # TODO: parse `html`, summarise, etc.
-        await message.channel.send(f"✅ Article downloaded {len(html):,} bytes.")
-        await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
+        await message.channel.send(f"✅ Article downloaded {len(processed_html):,} bytes.")
+        await send_text_file(message.channel, processed_html)
    except:
        LOGGER.exception("Playwright failed")
        await message.channel.send("❌ Sorry, I couldn't fetch that page.")

BIN  news/newsulizer.sqlite3  (new file, binary content not shown)


@@ -1,7 +1,25 @@
from __future__ import annotations

from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from typing import Final, Optional
import asyncio
import os
import sqlite3
import trafilatura
import types
from typing import Final, Optional, Union, Protocol, Any, Tuple
import logging


def process_html(html):
    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
                                include_tables=True, include_comments=False, favor_recall=True)


LOGGER = logging.getLogger("pool")

# logging.basicConfig(
#     level=logging.INFO,
#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
# )


class PlaywrightPool:
    _pw = None                      # playwright instance
    _browser: Optional[Browser] = None
@@ -54,4 +72,166 @@ class PlaywrightPool:
            html = await page.content()
            return html
        finally:
            await page.close()

class DBConnectionInfo:
    def __init__(
        self,
        dbname: str,
        user: str,
        password: str,
        host: str = "localhost",
        port: int = 5432,
    ) -> None:
        self.host = host
        self.port = port
        self.dbname = dbname
        self.user = user
        self.password = password

class ArticleRepository:
    """
    A very small wrapper around a database that maintains a single table
    called 'articles' inside a database called 'newsulizer'.

    If you pass a DBConnectionInfo, a PostgreSQL connection is opened with it.
    If you don't pass anything, a local SQLite file called
    './newsulizer.sqlite3' is created/used automatically.
    """

    _CREATE_DB_SQLITE = "newsulizer.sqlite3"
    _TABLE_NAME = "articles"
    def __init__(
        self,
        connection_info: Optional[DBConnectionInfo] = None,
        sqlite_path: Optional[str] = None,
    ) -> None:
        """
        Parameters
        ----------
        sqlite_path:
            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
            when *connection_info* is omitted.
        """
        if connection_info is None:
            sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
            connection = self._make_sqlite_conn(sqlite_path)
            self.cursor_type = "?"
        else:
            connection = self._make_postgres_conn(
                host=connection_info.host,
                port=connection_info.port,
                dbname=connection_info.dbname,
                user=connection_info.user,
                password=connection_info.password,
            )
            self.cursor_type = "%s"

        self._conn = connection
        self._ensure_schema()

        # Protect SQLite (which is not async-safe) by one lock
        self._lock = asyncio.Lock()
    # ------------------------------------------------------------------ #
    # public API
    # ------------------------------------------------------------------ #
    async def get_article(self, url: str) -> str:
        """
        Main entry point.
        Returns the processed text if it is already cached.
        Otherwise downloads it, processes it, stores it, and returns it.
        """
        # Single writer at a time when using sqlite3 avoids `database is locked`
        async with self._lock:
            row = self._row_for_url(url)
            if row and row[3]:             # row = (id, url, raw, processed)
                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
                return row[3]              # processed_html already present

        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
        raw_html = await PlaywrightPool.fetch_html(url)
        processed_html = process_html(raw_html)

        async with self._lock:
            # Upsert:
            self._conn.execute(
                f"""
                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
                ON CONFLICT(url) DO UPDATE SET
                    raw_html=EXCLUDED.raw_html,
                    processed_html=EXCLUDED.processed_html
                """,
                (url, raw_html, processed_html),
            )
            self._conn.commit()

        return processed_html
    def close(self) -> None:
        """Close the underlying DB connection."""
        try:
            self._conn.close()
        except Exception:
            pass
    # ------------------------------------------------------------------ #
    # internals
    # ------------------------------------------------------------------ #
    def _ensure_schema(self) -> None:
        """Create the articles table if it does not yet exist."""
        # Simple feature detection for DBs that do not support
        # `ON CONFLICT` (mainly older MySQL) could be added here.
        self._conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                raw_html TEXT NOT NULL,
                processed_html TEXT NOT NULL
            )
            """
        )
        self._conn.commit()

    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
        cur = self._conn.cursor()
        cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
        return cur.fetchone()
    @staticmethod
    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
        first_time = not os.path.exists(sqlite_path)
        connection = sqlite3.connect(sqlite_path, check_same_thread=False)

        # Enforce basic integrity
        connection.execute("PRAGMA foreign_keys = ON")
        connection.execute("PRAGMA busy_timeout = 5000")

        if first_time:
            # Ensure a human-readable filename, not an unnamed ATTACH
            LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
        else:
            LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")

        return connection
    @staticmethod
    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
        try:
            import psycopg2
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "psycopg2 is required for PostgreSQL support; "
                "run `pip install psycopg2-binary`"
            ) from exc

        conn = psycopg2.connect(
            host=host, port=port, dbname=dbname, user=user, password=password
        )
        conn.autocommit = False
        return conn
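
A minimal usage sketch of the new ArticleRepository outside the bot, assuming the module is importable as pool (as the bot changes above do) and that PlaywrightPool brings up its browser on the first fetch_html call; the entry point and example URL are illustrative only:

import asyncio

from pool import ArticleRepository

async def main() -> None:
    repo = ArticleRepository()  # no connection_info, so ./newsulizer.sqlite3 is created or reused

    # First call: cache miss, so the URL is fetched via PlaywrightPool,
    # run through process_html, stored in the articles table, and returned.
    markdown = await repo.get_article("https://example.com/some-article")

    # Second call for the same URL: served straight from the processed_html column.
    cached = await repo.get_article("https://example.com/some-article")
    assert markdown == cached

    repo.close()

asyncio.run(main())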
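
The caching write in get_article leans on the ON CONFLICT upsert, which requires SQLite 3.24 or newer. A small self-contained sketch of that behaviour against an in-memory database with the same table shape (the URL and values are made up):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    """
    CREATE TABLE IF NOT EXISTS articles (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        url TEXT UNIQUE NOT NULL,
        raw_html TEXT NOT NULL,
        processed_html TEXT NOT NULL
    )
    """
)

upsert = """
    INSERT INTO articles (url, raw_html, processed_html)
    VALUES (?, ?, ?)
    ON CONFLICT(url) DO UPDATE SET
        raw_html=EXCLUDED.raw_html,
        processed_html=EXCLUDED.processed_html
"""

# Same URL twice: the second execute updates the row instead of inserting a duplicate.
conn.execute(upsert, ("https://example.com/a", "<html>v1</html>", "v1"))
conn.execute(upsert, ("https://example.com/a", "<html>v2</html>", "v2"))
conn.commit()

print(conn.execute("SELECT url, processed_html FROM articles").fetchall())
# [('https://example.com/a', 'v2')]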