caching with sql
parent 48cc36011e
commit db391a6163
Binary file not shown.
news/main.py (12 changed lines)
@@ -8,7 +8,7 @@ from typing import Final, Optional, List
 import discord
 from dotenv import load_dotenv
 import re
-from pool import PlaywrightPool
+from pool import PlaywrightPool, ArticleRepository
 import trafilatura
 import io
 
@@ -25,12 +25,14 @@ intents.message_content = True
 
 bot = discord.Client(intents=intents)
 
-LOGGER = logging.getLogger("Newsulizer")
+LOGGER = logging.getLogger("main")
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
 )
 
+article_repository = ArticleRepository()
+
 async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
     fp = io.BytesIO(content.encode("utf-8"))
     file = discord.File(fp, filename=filename)
@@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)
 
     try:
-        html = await PlaywrightPool.fetch_html(url)
+        processed_html = await article_repository.get_article(url)
         # TODO: parse `html`, summarise, etc.
-        await message.channel.send(f"✅ Article downloaded – {len(html):,} bytes.")
-        await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
+        await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
+        await send_text_file(message.channel, processed_html)
     except:
         LOGGER.exception("Playwright failed")
         await message.channel.send("❌ Sorry, I couldn't fetch that page.")
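Note: the hunk above cuts send_text_file off right after the discord.File is constructed, so the helper's trailing line(s) are not shown. A minimal sketch of how such a helper typically finishes with discord.py, assuming the missing tail simply sends the attachment (an assumption, not the commit's actual code):

import io
import discord

async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
    # Wrap the markdown in an in-memory buffer and hand it to discord.py as an attachment.
    fp = io.BytesIO(content.encode("utf-8"))
    file = discord.File(fp, filename=filename)
    # Assumed final step, not visible in the hunk above.
    await channel.send(file=file)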
Binary file not shown.
news/pool.py (184 changed lines)
@@ -1,7 +1,25 @@
 from __future__ import annotations
 
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-from typing import Final, Optional
 import asyncio
 
+import os
+import sqlite3
+import trafilatura
+import types
+from typing import Final, Optional, Union, Protocol, Any, Tuple
+import logging
+
+def process_html(html):
+    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
+                               include_tables=True, include_comments=False, favor_recall=True)
+
+LOGGER = logging.getLogger("pool")
+# logging.basicConfig(
+#     level=logging.INFO,
+#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+# )
+
 class PlaywrightPool:
     _pw = None  # playwright instance
     _browser: Optional[Browser] = None
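For orientation, a small sketch (not part of the commit) of what the new process_html wrapper yields: with these options trafilatura.extract emits Markdown, and it returns None when it cannot detect any main content, so callers may want to guard for that.

from pool import process_html

html = """<html><body><article>
<h1>Example headline</h1>
<p>First paragraph of the story.</p>
<p>Second paragraph with a <a href="https://example.com">link</a>.</p>
</article></body></html>"""

markdown = process_html(html)
if markdown is None:
    # trafilatura found no extractable main content (possible on very short pages)
    print("extraction failed")
else:
    print(markdown)  # Markdown text, e.g. the heading rendered as "# Example headline"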
@@ -54,4 +72,166 @@ class PlaywrightPool:
             html = await page.content()
             return html
         finally:
             await page.close()
+
+
+class DBConnectionInfo:
+    def __init__(
+        self,
+        dbname: str,
+        user: str,
+        password: str,
+        host: str = "localhost",
+        port: int = 5432,
+    ) -> None:
+        self.host = host
+        self.port = port
+        self.dbname = dbname
+        self.user = user
+        self.password = password
+
+
+class ArticleRepository:
+    """
+    A very small wrapper around a database that maintains a single table
+    called 'articles' inside a database called 'newsulizer'.
+
+    • If you pass an existing DB-API connection, it will be used as-is.
+    • If you don’t pass anything, a local SQLite file called
+      './newsulizer.sqlite3' is created/used automatically.
+    """
+
+    _CREATE_DB_SQLITE = "newsulizer.sqlite3"
+    _TABLE_NAME = "articles"
+
+    def __init__(
+        self,
+        connection_info: Optional[DBConnectionInfo] = None,
+        sqlite_path: Optional[str] = None,
+    ) -> None:
+        """
+        Parameters
+        ----------
+        sqlite_path:
+            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
+            when *connection* is omitted.
+        """
+
+        if connection_info is None:
+            sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
+            connection = self._make_sqlite_conn(sqlite_path)
+            self.cursor_type = "?"
+        else:
+            connection = self._make_postgres_conn(
+                host=connection_info.host,
+                port=connection_info.port,
+                dbname=connection_info.dbname,
+                user=connection_info.user,
+                password=connection_info.password,
+            )
+            self.cursor_type = "%s"
+
+        self._conn = connection
+        self._ensure_schema()
+
+        # Protect SQLite (which is not async-safe) by one lock
+        self._lock = asyncio.Lock()
+
+    # ------------------------------------------------------------------ #
+    # public API
+    # ------------------------------------------------------------------ #
+    async def get_article(self, url: str) -> str:
+        """
+        Main entry point.
+        • Returns the processed text if it is already cached.
+        • Otherwise downloads it, processes it, stores it, and returns it.
+        """
+
+        # Single writer at a time when using sqlite3 – avoids `database is locked`
+        async with self._lock:
+            row = self._row_for_url(url)
+
+            if row and row[3]:  # row = (id, url, raw, processed)
+                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
+                return row[3]  # processed_html already present
+
+        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
+        raw_html = await PlaywrightPool.fetch_html(url)
+        processed_html = process_html(raw_html)
+
+        async with self._lock:
+            # Upsert:
+            self._conn.execute(
+                f"""
+                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
+                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
+                ON CONFLICT(url) DO UPDATE SET
+                    raw_html=EXCLUDED.raw_html,
+                    processed_html=EXCLUDED.processed_html
+                """,
+                (url, raw_html, processed_html),
+            )
+            self._conn.commit()
+
+        return processed_html
+
+    def close(self) -> None:
+        """Close the underlying DB connection."""
+        try:
+            self._conn.close()
+        except Exception:
+            pass
+
+    # ------------------------------------------------------------------ #
+    # internals
+    # ------------------------------------------------------------------ #
+    def _ensure_schema(self) -> None:
+        """Create the articles table if it does not yet exist."""
+        # Simple feature detection for DBs that do not support
+        # `ON CONFLICT` (mainly older MySQL) could be added here.
+        self._conn.execute(
+            f"""
+            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                url TEXT UNIQUE NOT NULL,
+                raw_html TEXT NOT NULL,
+                processed_html TEXT NOT NULL
+            )
+            """
+        )
+        self._conn.commit()
+
+    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
+        cur = self._conn.cursor()
+        cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
+        return cur.fetchone()
+
+    @staticmethod
+    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
+        first_time = not os.path.exists(sqlite_path)
+        connection = sqlite3.connect(sqlite_path, check_same_thread=False)
+        # Enforce basic integrity
+        connection.execute("PRAGMA foreign_keys = ON")
+        connection.execute("PRAGMA busy_timeout = 5000")
+
+        if first_time:
+            # Ensure a human-readable filename, not an unnamed ATTACH
+            LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
+        else:
+            LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")
+        return connection
+
+    @staticmethod
+    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
+        try:
+            import psycopg2
+        except ModuleNotFoundError as exc:
+            raise RuntimeError(
+                "psycopg2 is required for PostgreSQL support – "
+                "run `pip install psycopg2-binary`"
+            ) from exc
+
+        conn = psycopg2.connect(
+            host=host, port=port, dbname=dbname, user=user, password=password
+        )
+        conn.autocommit = False
+        return conn
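Taken together, a minimal sketch (not part of the commit) of how the new repository might be driven. It assumes the Playwright pool has already been started by the bot's startup hook (outside this diff) so a cache miss can actually fetch, and the PostgreSQL credentials shown are placeholders.

import asyncio

from pool import ArticleRepository, DBConnectionInfo

async def demo() -> None:
    # Default backend: a local ./newsulizer.sqlite3 file, created on first use.
    repo = ArticleRepository()

    # Hypothetical alternative: the same table in PostgreSQL (placeholder credentials).
    # repo = ArticleRepository(DBConnectionInfo(dbname="newsulizer", user="news", password="secret"))

    url = "https://example.com/some-article"
    first = await repo.get_article(url)   # cache miss: Playwright fetch + trafilatura, then upsert
    second = await repo.get_article(url)  # cache hit: served straight from the articles table
    assert first == second

    repo.close()

asyncio.run(demo())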