diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc
index 98fda27..7da7b5e 100644
Binary files a/news/__pycache__/pool.cpython-311.pyc and b/news/__pycache__/pool.cpython-311.pyc differ
diff --git a/news/main.py b/news/main.py
index dfbecb3..528f2a2 100644
--- a/news/main.py
+++ b/news/main.py
@@ -8,7 +8,7 @@ from typing import Final, Optional, List
 import discord
 from dotenv import load_dotenv
 import re
-from pool import PlaywrightPool
+from pool import PlaywrightPool, ArticleRepository
 import trafilatura
 import io
@@ -25,12 +25,14 @@ intents.message_content = True
 bot = discord.Client(intents=intents)
 
-LOGGER = logging.getLogger("Newsulizer")
+LOGGER = logging.getLogger("main")
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
 )
 
+article_repository = ArticleRepository()
+
 async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
     fp = io.BytesIO(content.encode("utf-8"))
     file = discord.File(fp, filename=filename)
@@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)
 
     try:
-        html = await PlaywrightPool.fetch_html(url)
-        # TODO: parse `html`, summarise, etc.
-        await message.channel.send(f"✅ Article downloaded – {len(html):,} bytes.")
-        await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
-    except:
-        LOGGER.exception("Playwright failed")
+        processed_html = await article_repository.get_article(url)
+        # TODO: summarise `processed_html`, etc.
+        await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} characters.")
+        await send_text_file(message.channel, processed_html)
+    except Exception:
+        LOGGER.exception("Failed to fetch or process %s", url)
         await message.channel.send("❌ Sorry, I couldn't fetch that page.")
diff --git a/news/newsulizer.sqlite3 b/news/newsulizer.sqlite3
new file mode 100644
index 0000000..feb5b52
Binary files /dev/null and b/news/newsulizer.sqlite3 differ
diff --git a/news/pool.py b/news/pool.py
index 9a0b847..a858e94 100644
--- a/news/pool.py
+++ b/news/pool.py
@@ -1,7 +1,25 @@
+from __future__ import annotations
+
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from typing import Final, Optional
 import asyncio
+import os
+import sqlite3
+import trafilatura
+from typing import Final, Optional, Any, Tuple
+import logging
+
+def process_html(html: str) -> Optional[str]:
+    # trafilatura returns None when it cannot extract anything useful
+    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
+                               include_tables=True, include_comments=False, favor_recall=True)
+
+LOGGER = logging.getLogger("pool")
+
 class PlaywrightPool:
     _pw = None  # playwright instance
     _browser: Optional[Browser] = None
@@ -54,4 +72,166 @@ class PlaywrightPool:
             html = await page.content()
             return html
         finally:
-            await page.close()
\ No newline at end of file
+            await page.close()
+
+class DBConnectionInfo:
+    def __init__(
+        self,
+        dbname: str,
+        user: str,
+        password: str,
+        host: str = "localhost",
+        port: int = 5432,
+    ) -> None:
+        self.host = host
+        self.port = port
+        self.dbname = dbname
+        self.user = user
+        self.password = password
+
+
+class ArticleRepository:
+    """
+    A small wrapper around a database that maintains a single table called
+    'articles', with one cached entry per URL.
+
+    • If you pass a DBConnectionInfo, a PostgreSQL connection is opened with it.
+    • If you don't pass anything, a local SQLite file called
+      './newsulizer.sqlite3' is created/used automatically.
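+
+    Typical use (sketch – must be awaited inside a running event loop):
+
+        repo = ArticleRepository()              # cache in ./newsulizer.sqlite3
+        text = await repo.get_article("https://example.com/article")
+        repo.close()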
+    """
+
+    _DEFAULT_SQLITE_PATH = "newsulizer.sqlite3"
+    _TABLE_NAME = "articles"
+
+    def __init__(
+        self,
+        connection_info: Optional[DBConnectionInfo] = None,
+        sqlite_path: Optional[str] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        connection_info:
+            PostgreSQL connection details. When omitted, SQLite is used.
+        sqlite_path:
+            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
+            when *connection_info* is omitted.
+        """
+
+        if connection_info is None:
+            sqlite_path = sqlite_path or self._DEFAULT_SQLITE_PATH
+            connection = self._make_sqlite_conn(sqlite_path)
+            self._placeholder = "?"   # sqlite3 uses qmark-style parameters
+        else:
+            connection = self._make_postgres_conn(
+                host=connection_info.host,
+                port=connection_info.port,
+                dbname=connection_info.dbname,
+                user=connection_info.user,
+                password=connection_info.password,
+            )
+            self._placeholder = "%s"  # psycopg2 uses format-style parameters
+
+        self._conn = connection
+        self._ensure_schema()
+
+        # Serialise all DB access: sqlite3 is synchronous and not async-safe.
+        self._lock = asyncio.Lock()
+
+    # ------------------------------------------------------------------ #
+    # public API
+    # ------------------------------------------------------------------ #
+    async def get_article(self, url: str) -> str:
+        """
+        Main entry point.
+        • Returns the processed text if it is already cached.
+        • Otherwise downloads it, processes it, stores it, and returns it.
+        """
+
+        # Single writer at a time when using sqlite3 – avoids `database is locked`
+        async with self._lock:
+            row = self._row_for_url(url)
+
+            if row and row[3]:  # row = (id, url, raw_html, processed_html)
+                LOGGER.info("[ArticleRepository] Found cached article for %s", url)
+                return row[3]  # processed_html already present
+
+        LOGGER.info("[ArticleRepository] Downloading article for %s", url)
+        raw_html = await PlaywrightPool.fetch_html(url)
+        processed_html = process_html(raw_html)
+        if processed_html is None:
+            # trafilatura found nothing usable; storing None would violate the
+            # NOT NULL constraint on processed_html, so fail loudly instead
+            raise ValueError(f"could not extract readable content from {url}")
+
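+        # The lock is intentionally released while Playwright downloads the
+        # page, so one slow fetch does not serialise every other caller; only
+        # the database reads and writes are guarded. Two coroutines racing on
+        # the same URL may both download it, but the upsert below keeps the
+        # cached row unique and consistent.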
+        async with self._lock:
+            # Upsert: insert a new row, or refresh the cached copy on conflict.
+            # Go through an explicit cursor – sqlite3.Connection.execute() is a
+            # shortcut that psycopg2 connections do not provide.
+            cur = self._conn.cursor()
+            cur.execute(
+                f"""
+                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
+                VALUES ({self._placeholder}, {self._placeholder}, {self._placeholder})
+                ON CONFLICT(url) DO UPDATE SET
+                    raw_html=EXCLUDED.raw_html,
+                    processed_html=EXCLUDED.processed_html
+                """,
+                (url, raw_html, processed_html),
+            )
+            self._conn.commit()
+
+        return processed_html
+
+    def close(self) -> None:
+        """Close the underlying DB connection."""
+        try:
+            self._conn.close()
+        except Exception:
+            pass
+
+    # ------------------------------------------------------------------ #
+    # internals
+    # ------------------------------------------------------------------ #
+    def _ensure_schema(self) -> None:
+        """Create the articles table if it does not yet exist."""
+        # NOTE: AUTOINCREMENT is SQLite syntax; the PostgreSQL path would need
+        # an IDENTITY/SERIAL column instead. Feature detection for databases
+        # without `ON CONFLICT` support (mainly older MySQL) could also be
+        # added here.
+        cur = self._conn.cursor()
+        cur.execute(
+            f"""
+            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                url TEXT UNIQUE NOT NULL,
+                raw_html TEXT NOT NULL,
+                processed_html TEXT NOT NULL
+            )
+            """
+        )
+        self._conn.commit()
+
+    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
+        cur = self._conn.cursor()
+        cur.execute(
+            f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self._placeholder}",
+            (url,),
+        )
+        return cur.fetchone()
+
+    @staticmethod
+    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
+        first_time = not os.path.exists(sqlite_path)
+        connection = sqlite3.connect(sqlite_path, check_same_thread=False)
+        # Enforce basic integrity
+        connection.execute("PRAGMA foreign_keys = ON")
+        connection.execute("PRAGMA busy_timeout = 5000")
+
+        if first_time:
+            LOGGER.info("[ArticleRepository] Created fresh local database at '%s'", sqlite_path)
+        else:
+            LOGGER.info("[ArticleRepository] Reusing existing local database at '%s'", sqlite_path)
+        return connection
+
+    @staticmethod
+    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
+        try:
+            import psycopg2
+        except ModuleNotFoundError as exc:
+            raise RuntimeError(
+                "psycopg2 is required for PostgreSQL support – "
+                "run `pip install psycopg2-binary`"
+            ) from exc
+
+        conn = psycopg2.connect(
+            host=host, port=port, dbname=dbname, user=user, password=password
+        )
+        conn.autocommit = False
+        return conn
\ No newline at end of file
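
Usage sketch (illustrative, not part of the diff): wiring ArticleRepository to
PostgreSQL instead of the default SQLite file. The class and parameter names
come from news/pool.py above; the host and credentials are placeholders.

    import asyncio

    from pool import ArticleRepository, DBConnectionInfo

    async def main() -> None:
        repo = ArticleRepository(
            connection_info=DBConnectionInfo(
                dbname="newsulizer",
                user="news",
                password="secret",      # placeholder credentials
                host="localhost",
                port=5432,
            )
        )
        try:
            text = await repo.get_article("https://example.com/article")
            print(text[:200])
        finally:
            repo.close()

    asyncio.run(main())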