Compare commits

No commits in common. "de6f9efbedbda6aa68b2eefa2964906e855ef98c" and "48cc36011e9d6670384f754a11e66dbcc4fd906b" have entirely different histories.

6 changed files with 10 additions and 206 deletions

.gitignore

@@ -1 +0,0 @@
-.env

news/.gitignore

@@ -1 +0,0 @@
-*.sqlite3

@@ -8,12 +8,11 @@ from typing import Final, Optional, List
 import discord
 from dotenv import load_dotenv
 import re
-from pool import PlaywrightPool, ArticleRepository
+from pool import PlaywrightPool
 import trafilatura
 import io
 from ollama import chat
 from ollama import ChatResponse
 from ollama import Client
 from ollama import AsyncClient
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page

 load_dotenv()
@@ -26,25 +25,12 @@ intents.message_content = True
 bot = discord.Client(intents=intents)

-LOGGER = logging.getLogger("main")
+LOGGER = logging.getLogger("Newsulizer")
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
 )

-article_repository = ArticleRepository()
-
-async def send_chat(messages):
-    return await AsyncClient(host="192.168.69.3:11434").chat(
-        model="deepseek-r1:8b",
-        messages=messages,
-        stream=False,
-        options={
-            'temperature': 0.5,
-            "num_ctx": 128000
-        },
-        think=True)

 async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
     fp = io.BytesIO(content.encode("utf-8"))
     file = discord.File(fp, filename=filename)
@@ -60,10 +46,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)

     try:
-        processed_html = await article_repository.get_article(url)
+        html = await PlaywrightPool.fetch_html(url)
+        # TODO: parse `html`, summarise, etc.

-        await message.channel.send(f"✅ Article downloaded {len(processed_html):,} bytes.")
-        await send_text_file(message.channel, processed_html)
+        await message.channel.send(f"✅ Article downloaded {len(html):,} bytes.")
+        await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
     except:
         LOGGER.exception("Playwright failed")
         await message.channel.send("❌ Sorry, I couldn't fetch that page.")
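
Note: both revisions share the same entry point, handle_article_url; the hunk above only swaps its body (a cached ArticleRepository lookup on the de6f9 side versus a direct fetch plus inline trafilatura extraction on the 48cc3 side). A minimal sketch of how such a handler is typically dispatched from the bot's event loop, reusing the bot, discord, and handle_article_url names from this file; the URL regex and the on_message wiring are illustrative assumptions, not code from either commit:

import re

URL_PATTERN = re.compile(r"https?://\S+")  # hypothetical pattern, not taken from the diff

@bot.event
async def on_message(message: discord.Message) -> None:
    # Skip the bot's own messages to avoid feedback loops.
    if message.author == bot.user:
        return
    match = URL_PATTERN.search(message.content)
    if match:
        await handle_article_url(message, match.group(0))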

Binary file not shown.

pool.py

@@ -1,25 +1,7 @@
 from __future__ import annotations
 from playwright.async_api import async_playwright, Browser, BrowserContext, Page
-import asyncio
-import os
-import sqlite3
-import trafilatura
-import types
-from typing import Final, Optional, Union, Protocol, Any, Tuple
-import logging
-
-def process_html(html):
-    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
-                               include_tables=True, include_comments=False, favor_recall=True)
-
-LOGGER = logging.getLogger("pool")
-
-# logging.basicConfig(
-#     level=logging.INFO,
-#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-# )
+from typing import Final, Optional

 class PlaywrightPool:
     _pw = None  # playwright instance
     _browser: Optional[Browser] = None
@@ -73,165 +55,3 @@ class PlaywrightPool:
             return html
         finally:
             await page.close()
-
-class DBConnectionInfo:
-    def __init__(
-        self,
-        dbname: str,
-        user: str,
-        password: str,
-        host: str = "localhost",
-        port: int = 5432,
-    ) -> None:
-        self.host = host
-        self.port = port
-        self.dbname = dbname
-        self.user = user
-        self.password = password
-
-class ArticleRepository:
-    """
-    A very small wrapper around a database that maintains a single table
-    called 'articles' inside a database called 'newsulizer'.
-
-    If you pass an existing DB-API connection, it will be used as-is.
-    If you don't pass anything, a local SQLite file called
-    './newsulizer.sqlite3' is created/used automatically.
-    """
-
-    _CREATE_DB_SQLITE = "newsulizer.sqlite3"
-    _TABLE_NAME = "articles"
-
-    def __init__(
-        self,
-        connection_info: Optional[DBConnectionInfo] = None,
-        sqlite_path: Optional[str] = None,
-    ) -> None:
-        """
-        Parameters
-        ----------
-        sqlite_path:
-            Path to an SQLite file. Defaults to ./newsulizer.sqlite3
-            when *connection_info* is omitted.
-        """
-        if connection_info is None:
-            sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
-            connection = self._make_sqlite_conn(sqlite_path)
-            self.cursor_type = "?"
-        else:
-            connection = self._make_postgres_conn(
-                host=connection_info.host,
-                port=connection_info.port,
-                dbname=connection_info.dbname,
-                user=connection_info.user,
-                password=connection_info.password,
-            )
-            self.cursor_type = "%s"
-
-        self._conn = connection
-        self._ensure_schema()
-        # Protect SQLite (which is not async-safe) by one lock
-        self._lock = asyncio.Lock()
-
-    # ------------------------------------------------------------------ #
-    # public API
-    # ------------------------------------------------------------------ #
-    async def get_article(self, url: str) -> str:
-        """
-        Main entry point.
-
-        Returns the processed text if it is already cached.
-        Otherwise downloads it, processes it, stores it, and returns it.
-        """
-        # Single writer at a time when using sqlite3 avoids `database is locked`
-        async with self._lock:
-            row = self._row_for_url(url)
-
-            if row and row[3]:  # row = (id, url, raw, processed)
-                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
-                return row[3]  # processed_html already present
-
-            LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
-
-        raw_html = await PlaywrightPool.fetch_html(url)
-        processed_html = process_html(raw_html)
-
-        async with self._lock:
-            # Upsert:
-            self._conn.execute(
-                f"""
-                INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
-                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
-                ON CONFLICT(url) DO UPDATE SET
-                    raw_html=EXCLUDED.raw_html,
-                    processed_html=EXCLUDED.processed_html
-                """,
-                (url, raw_html, processed_html),
-            )
-            self._conn.commit()
-            return processed_html
-
-    def close(self) -> None:
-        """Close the underlying DB connection."""
-        try:
-            self._conn.close()
-        except Exception:
-            pass
-
-    # ------------------------------------------------------------------ #
-    # internals
-    # ------------------------------------------------------------------ #
-    def _ensure_schema(self) -> None:
-        """Create the articles table if it does not yet exist."""
-        # Simple feature detection for DBs that do not support
-        # `ON CONFLICT` (mainly older MySQL) could be added here.
-        self._conn.execute(
-            f"""
-            CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
-                id INTEGER PRIMARY KEY AUTOINCREMENT,
-                url TEXT UNIQUE NOT NULL,
-                raw_html TEXT NOT NULL,
-                processed_html TEXT NOT NULL
-            )
-            """
-        )
-        self._conn.commit()
-
-    def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
-        cur = self._conn.cursor()
-        cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
-        return cur.fetchone()
-
-    @staticmethod
-    def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
-        first_time = not os.path.exists(sqlite_path)
-        connection = sqlite3.connect(sqlite_path, check_same_thread=False)
-        # Enforce basic integrity
-        connection.execute("PRAGMA foreign_keys = ON")
-        connection.execute("PRAGMA busy_timeout = 5000")
-        if first_time:
-            # Ensure a human-readable filename, not an unnamed ATTACH
-            LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
-        else:
-            LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")
-        return connection
-
-    @staticmethod
-    def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
-        try:
-            import psycopg2
-        except ModuleNotFoundError as exc:
-            raise RuntimeError(
-                "psycopg2 is required for PostgreSQL support; "
-                "run `pip install psycopg2-binary`"
-            ) from exc
-        conn = psycopg2.connect(
-            host=host, port=port, dbname=dbname, user=user, password=password
-        )
-        conn.autocommit = False
-        return conn
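
Taken together, the deleted lines form a small cache-aside layer: get_article returns the processed text from the articles table when it is already cached, and otherwise fetches through PlaywrightPool, runs trafilatura, and upserts the result under the URL's UNIQUE key. A minimal usage sketch against the de6f9 side, assuming PlaywrightPool.fetch_html starts the shared browser on first use (its setup is outside this hunk); the URL is a placeholder:

import asyncio
from pool import ArticleRepository

async def main() -> None:
    repo = ArticleRepository()  # no DBConnectionInfo -> ./newsulizer.sqlite3 with "?" placeholders
    try:
        # First call fetches via PlaywrightPool and upserts into `articles`;
        # repeating it returns the cached processed_html with no network I/O.
        text = await repo.get_article("https://example.com/story")  # placeholder URL
        print(f"{len(text):,} characters of processed markdown")
    finally:
        repo.close()

asyncio.run(main())

The cursor_type field papers over the placeholder-style difference between sqlite3 ("?") and psycopg2 ("%s"), which is why the INSERT above is assembled with f-string placeholders rather than a hard-coded parameter style.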