caching with sql
parent
48cc36011e
commit
db391a6163
Binary file not shown.
12
news/main.py
12
news/main.py
|
@ -8,7 +8,7 @@ from typing import Final, Optional, List
|
||||||
import discord
|
import discord
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import re
|
import re
|
||||||
from pool import PlaywrightPool
|
from pool import PlaywrightPool, ArticleRepository
|
||||||
import trafilatura
|
import trafilatura
|
||||||
import io
|
import io
|
||||||
|
|
||||||
|
@ -25,12 +25,14 @@ intents.message_content = True
|
||||||
|
|
||||||
bot = discord.Client(intents=intents)
|
bot = discord.Client(intents=intents)
|
||||||
|
|
||||||
LOGGER = logging.getLogger("Newsulizer")
|
LOGGER = logging.getLogger("main")
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
article_repository = ArticleRepository()
|
||||||
|
|
||||||
async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
|
async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
|
||||||
fp = io.BytesIO(content.encode("utf-8"))
|
fp = io.BytesIO(content.encode("utf-8"))
|
||||||
file = discord.File(fp, filename=filename)
|
file = discord.File(fp, filename=filename)
|
||||||
|
@ -46,10 +48,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
LOGGER.info("Received URL from %s: %s", message.author, url)
|
LOGGER.info("Received URL from %s: %s", message.author, url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
html = await PlaywrightPool.fetch_html(url)
|
processed_html = await article_repository.get_article(url)
|
||||||
# TODO: parse `html`, summarise, etc.
|
# TODO: parse `html`, summarise, etc.
|
||||||
await message.channel.send(f"✅ Article downloaded – {len(html):,} bytes.")
|
await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
|
||||||
await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
|
await send_text_file(message.channel, processed_html)
|
||||||
except:
|
except:
|
||||||
LOGGER.exception("Playwright failed")
|
LOGGER.exception("Playwright failed")
|
||||||
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
|
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
|
||||||
|
|
Binary file not shown.
184
news/pool.py
184
news/pool.py
|
@ -1,7 +1,25 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
||||||
from typing import Final, Optional
|
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import trafilatura
|
||||||
|
import types
|
||||||
|
from typing import Final, Optional, Union, Protocol, Any, Tuple
|
||||||
|
import logging
|
||||||
|
|
||||||
|
def process_html(html):
|
||||||
|
return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
|
||||||
|
include_tables=True, include_comments=False, favor_recall=True)
|
||||||
|
|
||||||
|
LOGGER = logging.getLogger("pool")
|
||||||
|
# logging.basicConfig(
|
||||||
|
# level=logging.INFO,
|
||||||
|
# format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||||
|
# )
|
||||||
|
|
||||||
class PlaywrightPool:
|
class PlaywrightPool:
|
||||||
_pw = None # playwright instance
|
_pw = None # playwright instance
|
||||||
_browser: Optional[Browser] = None
|
_browser: Optional[Browser] = None
|
||||||
|
@ -54,4 +72,166 @@ class PlaywrightPool:
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
return html
|
return html
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
|
class DBConnectionInfo:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
dbname: str,
|
||||||
|
user: str,
|
||||||
|
password: str,
|
||||||
|
host: str = "localhost",
|
||||||
|
port: int = 5432,
|
||||||
|
) -> None:
|
||||||
|
self.host = host
|
||||||
|
self.port = port
|
||||||
|
self.dbname = dbname
|
||||||
|
self.user = user
|
||||||
|
self.password = password
|
||||||
|
|
||||||
|
|
||||||
|
class ArticleRepository:
|
||||||
|
"""
|
||||||
|
A very small wrapper around a database that maintains a single table
|
||||||
|
called 'articles' inside a database called 'newsulizer'.
|
||||||
|
|
||||||
|
• If you pass an existing DB-API connection, it will be used as-is.
|
||||||
|
• If you don’t pass anything, a local SQLite file called
|
||||||
|
'./newsulizer.sqlite3' is created/used automatically.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_CREATE_DB_SQLITE = "newsulizer.sqlite3"
|
||||||
|
_TABLE_NAME = "articles"
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
connection_info: Optional[DBConnectionInfo] = None,
|
||||||
|
sqlite_path: Optional[str] = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Parameters
|
||||||
|
----------
|
||||||
|
sqlite_path:
|
||||||
|
Path to an SQLite file. Defaults to ./newsulizer.sqlite3
|
||||||
|
when *connection* is omitted.
|
||||||
|
"""
|
||||||
|
|
||||||
|
if connection_info is None:
|
||||||
|
sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
|
||||||
|
connection = self._make_sqlite_conn(sqlite_path)
|
||||||
|
self.cursor_type = "?"
|
||||||
|
else:
|
||||||
|
connection = self._make_postgres_conn(
|
||||||
|
host=connection_info.host,
|
||||||
|
port=connection_info.port,
|
||||||
|
dbname=connection_info.dbname,
|
||||||
|
user=connection_info.user,
|
||||||
|
password=connection_info.password,
|
||||||
|
)
|
||||||
|
self.cursor_type = "%s"
|
||||||
|
|
||||||
|
self._conn = connection
|
||||||
|
self._ensure_schema()
|
||||||
|
|
||||||
|
# Protect SQLite (which is not async-safe) by one lock
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# public API
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
async def get_article(self, url: str) -> str:
|
||||||
|
"""
|
||||||
|
Main entry point.
|
||||||
|
• Returns the processed text if it is already cached.
|
||||||
|
• Otherwise downloads it, processes it, stores it, and returns it.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Single writer at a time when using sqlite3 – avoids `database is locked`
|
||||||
|
async with self._lock:
|
||||||
|
row = self._row_for_url(url)
|
||||||
|
|
||||||
|
if row and row[3]: # row = (id, url, raw, processed)
|
||||||
|
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
||||||
|
return row[3] # processed_html already present
|
||||||
|
|
||||||
|
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
||||||
|
raw_html = await PlaywrightPool.fetch_html(url)
|
||||||
|
processed_html = process_html(raw_html)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
# Upsert:
|
||||||
|
self._conn.execute(
|
||||||
|
f"""
|
||||||
|
INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
|
||||||
|
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
||||||
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
|
raw_html=EXCLUDED.raw_html,
|
||||||
|
processed_html=EXCLUDED.processed_html
|
||||||
|
""",
|
||||||
|
(url, raw_html, processed_html),
|
||||||
|
)
|
||||||
|
self._conn.commit()
|
||||||
|
|
||||||
|
return processed_html
|
||||||
|
|
||||||
|
def close(self) -> None:
|
||||||
|
"""Close the underlying DB connection."""
|
||||||
|
try:
|
||||||
|
self._conn.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
# internals
|
||||||
|
# ------------------------------------------------------------------ #
|
||||||
|
def _ensure_schema(self) -> None:
|
||||||
|
"""Create the articles table if it does not yet exist."""
|
||||||
|
# Simple feature detection for DBs that do not support
|
||||||
|
# `ON CONFLICT` (mainly older MySQL) could be added here.
|
||||||
|
self._conn.execute(
|
||||||
|
f"""
|
||||||
|
CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
|
||||||
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
|
url TEXT UNIQUE NOT NULL,
|
||||||
|
raw_html TEXT NOT NULL,
|
||||||
|
processed_html TEXT NOT NULL
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
)
|
||||||
|
self._conn.commit()
|
||||||
|
|
||||||
|
def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
|
||||||
|
cur = self._conn.cursor()
|
||||||
|
cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
|
||||||
|
return cur.fetchone()
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
|
||||||
|
first_time = not os.path.exists(sqlite_path)
|
||||||
|
connection = sqlite3.connect(sqlite_path, check_same_thread=False)
|
||||||
|
# Enforce basic integrity
|
||||||
|
connection.execute("PRAGMA foreign_keys = ON")
|
||||||
|
connection.execute("PRAGMA busy_timeout = 5000")
|
||||||
|
|
||||||
|
if first_time:
|
||||||
|
# Ensure a human-readable filename, not an unnamed ATTACH
|
||||||
|
LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
|
||||||
|
else:
|
||||||
|
LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")
|
||||||
|
return connection
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
|
||||||
|
try:
|
||||||
|
import psycopg2
|
||||||
|
except ModuleNotFoundError as exc:
|
||||||
|
raise RuntimeError(
|
||||||
|
"psycopg2 is required for PostgreSQL support – "
|
||||||
|
"run `pip install psycopg2-binary`"
|
||||||
|
) from exc
|
||||||
|
|
||||||
|
conn = psycopg2.connect(
|
||||||
|
host=host, port=port, dbname=dbname, user=user, password=password
|
||||||
|
)
|
||||||
|
conn.autocommit = False
|
||||||
|
return conn
|
Loading…
Reference in New Issue