caching with sql

main
Brett 2025-06-26 20:44:51 -04:00
parent 48cc36011e
commit db391a6163
4 changed files with 189 additions and 7 deletions

@@ -8,7 +8,7 @@ from typing import Final, Optional, List
import discord
from dotenv import load_dotenv
import re
-from pool import PlaywrightPool
+from pool import PlaywrightPool, ArticleRepository
import trafilatura
import io
@@ -25,12 +25,14 @@ intents.message_content = True
bot = discord.Client(intents=intents)

-LOGGER = logging.getLogger("Newsulizer")
+LOGGER = logging.getLogger("main")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)

+article_repository = ArticleRepository()

async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
    fp = io.BytesIO(content.encode("utf-8"))
    file = discord.File(fp, filename=filename)
@@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
    LOGGER.info("Received URL from %s: %s", message.author, url)
    try:
-       html = await PlaywrightPool.fetch_html(url)
+       processed_html = await article_repository.get_article(url)
        # TODO: parse `html`, summarise, etc.
-       await message.channel.send(f"✅ Article downloaded {len(html):,} bytes.")
-       await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
+       await message.channel.send(f"✅ Article downloaded {len(processed_html):,} bytes.")
+       await send_text_file(message.channel, processed_html)
    except:
        LOGGER.exception("Playwright failed")
        await message.channel.send("❌ Sorry, I couldn't fetch that page.")

BIN  news/newsulizer.sqlite3 (new file, binary not shown)

@@ -1,7 +1,25 @@
from __future__ import annotations
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from typing import Final, Optional
import asyncio
+import os
+import sqlite3
+import trafilatura
+import types
+from typing import Final, Optional, Union, Protocol, Any, Tuple
+import logging
+
+
+def process_html(html):
+    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
+                               include_tables=True, include_comments=False, favor_recall=True)
+
+
+LOGGER = logging.getLogger("pool")
+
+# logging.basicConfig(
+#     level=logging.INFO,
+#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+# )
class PlaywrightPool:
    _pw = None  # playwright instance
    _browser: Optional[Browser] = None

@@ -54,4 +72,166 @@ class PlaywrightPool:
            html = await page.content()
            return html
        finally:
            await page.close()

class DBConnectionInfo:
    def __init__(
        self,
        dbname: str,
        user: str,
        password: str,
        host: str = "localhost",
        port: int = 5432,
    ) -> None:
        self.host = host
        self.port = port
        self.dbname = dbname
        self.user = user
        self.password = password

class ArticleRepository:
    """
    A very small wrapper around a database that maintains a single table
    called 'articles' inside a database called 'newsulizer'.

    If you pass an existing DB-API connection, it will be used as-is.
    If you don't pass anything, a local SQLite file called
    './newsulizer.sqlite3' is created/used automatically.
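
    Example (illustrative sketch, not part of the committed code):
        repo = ArticleRepository()           # opens ./newsulizer.sqlite3
        text = await repo.get_article(url)   # cache-first fetch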
"""
_CREATE_DB_SQLITE = "newsulizer.sqlite3"
_TABLE_NAME = "articles"
    def __init__(
        self,
        connection_info: Optional[DBConnectionInfo] = None,
        sqlite_path: Optional[str] = None,
    ) -> None:
        """
        Parameters
        ----------
        sqlite_path:
            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
            when *connection_info* is omitted.
        """
        if connection_info is None:
            sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
            connection = self._make_sqlite_conn(sqlite_path)
            self.cursor_type = "?"   # sqlite3 uses qmark-style placeholders
        else:
            connection = self._make_postgres_conn(
                host=connection_info.host,
                port=connection_info.port,
                dbname=connection_info.dbname,
                user=connection_info.user,
                password=connection_info.password,
            )
            self.cursor_type = "%s"  # psycopg2 uses format-style placeholders

        self._conn = connection
        self._ensure_schema()

        # Protect SQLite (which is not async-safe) with a single lock
        self._lock = asyncio.Lock()
    # ------------------------------------------------------------------ #
    # public API
    # ------------------------------------------------------------------ #
    async def get_article(self, url: str) -> str:
        """
        Main entry point.
        Returns the processed text if it is already cached;
        otherwise downloads it, processes it, stores it, and returns it.
        """
        # A single writer at a time avoids sqlite3 `database is locked` errors
        async with self._lock:
            row = self._row_for_url(url)
            if row and row[3]:  # row = (id, url, raw, processed)
                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
                return row[3]  # processed_html already present

        # Fetch outside the lock so a slow download doesn't block other callers
        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
        raw_html = await PlaywrightPool.fetch_html(url)
        processed_html = process_html(raw_html)

        async with self._lock:
            # Upsert (needs SQLite 3.24+ / PostgreSQL 9.5+ for ON CONFLICT)
            self._conn.execute(
                f"""
                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
                ON CONFLICT(url) DO UPDATE SET
                    raw_html=EXCLUDED.raw_html,
                    processed_html=EXCLUDED.processed_html
                """,
                (url, raw_html, processed_html),
            )
            self._conn.commit()

        return processed_html
    def close(self) -> None:
        """Close the underlying DB connection."""
        try:
            self._conn.close()
        except Exception:
            pass
    # ------------------------------------------------------------------ #
    # internals
    # ------------------------------------------------------------------ #
    def _ensure_schema(self) -> None:
        """Create the articles table if it does not yet exist."""
        # Simple feature detection for DBs that do not support
        # `ON CONFLICT` (mainly older MySQL) could be added here.
        # NOTE: AUTOINCREMENT is SQLite syntax; PostgreSQL would need
        # an IDENTITY/SERIAL column instead.
        self._conn.execute(
            f"""
            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                url TEXT UNIQUE NOT NULL,
                raw_html TEXT NOT NULL,
                processed_html TEXT NOT NULL
            )
            """
        )
        self._conn.commit()
    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
        cur = self._conn.cursor()
        cur.execute(
            f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}",
            (url,),
        )
        return cur.fetchone()
    @staticmethod
    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
        first_time = not os.path.exists(sqlite_path)
        connection = sqlite3.connect(sqlite_path, check_same_thread=False)
        # Enforce basic integrity and wait up to 5s on a locked database
        connection.execute("PRAGMA foreign_keys = ON")
        connection.execute("PRAGMA busy_timeout = 5000")
        if first_time:
            LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
        else:
            LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")
        return connection
    @staticmethod
    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
        try:
            import psycopg2
        except ModuleNotFoundError as exc:
            raise RuntimeError(
                "psycopg2 is required for PostgreSQL support; "
                "run `pip install psycopg2-binary`"
            ) from exc

        conn = psycopg2.connect(
            host=host, port=port, dbname=dbname, user=user, password=password
        )
        conn.autocommit = False
        return conn
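
For reference, a sketch of pointing the repository at PostgreSQL instead of the default SQLite file. The credentials below are placeholders, psycopg2-binary must be installed, and (as noted in _ensure_schema) the CREATE TABLE statement would need a PostgreSQL-compatible id column before this works end-to-end:

from pool import ArticleRepository, DBConnectionInfo

info = DBConnectionInfo(
    dbname="newsulizer",
    user="newsbot",        # placeholder credentials
    password="change-me",
    host="localhost",
    port=5432,
)
repo = ArticleRepository(connection_info=info)  # placeholders switch from ? to %s
repo.close()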