Compare commits
No commits in common. "de6f9efbedbda6aa68b2eefa2964906e855ef98c" and "48cc36011e9d6670384f754a11e66dbcc4fd906b" have entirely different histories.
de6f9efbed
...
48cc36011e
|
@ -1 +0,0 @@
|
||||||
.env
|
|
|
@ -1 +0,0 @@
|
||||||
*.sqlite3
|
|
Binary file not shown.
30
news/main.py
30
news/main.py
|
@ -8,12 +8,11 @@ from typing import Final, Optional, List
|
||||||
import discord
|
import discord
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
import re
|
import re
|
||||||
from pool import PlaywrightPool, ArticleRepository
|
from pool import PlaywrightPool
|
||||||
|
import trafilatura
|
||||||
import io
|
import io
|
||||||
from ollama import chat
|
|
||||||
from ollama import ChatResponse
|
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
||||||
from ollama import Client
|
|
||||||
from ollama import AsyncClient
|
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
@ -26,25 +25,12 @@ intents.message_content = True
|
||||||
|
|
||||||
bot = discord.Client(intents=intents)
|
bot = discord.Client(intents=intents)
|
||||||
|
|
||||||
LOGGER = logging.getLogger("main")
|
LOGGER = logging.getLogger("Newsulizer")
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||||
)
|
)
|
||||||
|
|
||||||
article_repository = ArticleRepository()
|
|
||||||
|
|
||||||
async def send_chat(messages):
|
|
||||||
return await AsyncClient(host="192.168.69.3:11434").chat(
|
|
||||||
model="deepseek-r1:8b",
|
|
||||||
messages=messages,
|
|
||||||
stream=False,
|
|
||||||
options={
|
|
||||||
'temperature': 0.5,
|
|
||||||
"num_ctx": 128000
|
|
||||||
},
|
|
||||||
think=True)
|
|
||||||
|
|
||||||
async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
|
async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
|
||||||
fp = io.BytesIO(content.encode("utf-8"))
|
fp = io.BytesIO(content.encode("utf-8"))
|
||||||
file = discord.File(fp, filename=filename)
|
file = discord.File(fp, filename=filename)
|
||||||
|
@ -60,10 +46,10 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
LOGGER.info("Received URL from %s: %s", message.author, url)
|
LOGGER.info("Received URL from %s: %s", message.author, url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
processed_html = await article_repository.get_article(url)
|
html = await PlaywrightPool.fetch_html(url)
|
||||||
# TODO: parse `html`, summarise, etc.
|
# TODO: parse `html`, summarise, etc.
|
||||||
await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
|
await message.channel.send(f"✅ Article downloaded – {len(html):,} bytes.")
|
||||||
await send_text_file(message.channel, processed_html)
|
await send_text_file(message.channel, trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_comments=False, favor_recall=True))
|
||||||
except:
|
except:
|
||||||
LOGGER.exception("Playwright failed")
|
LOGGER.exception("Playwright failed")
|
||||||
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
|
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
|
||||||
|
|
Binary file not shown.
184
news/pool.py
184
news/pool.py
|
@ -1,25 +1,7 @@
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
||||||
|
from typing import Final, Optional
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
import os
|
|
||||||
import sqlite3
|
|
||||||
import trafilatura
|
|
||||||
import types
|
|
||||||
from typing import Final, Optional, Union, Protocol, Any, Tuple
|
|
||||||
import logging
|
|
||||||
|
|
||||||
def process_html(html):
|
|
||||||
return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
|
|
||||||
include_tables=True, include_comments=False, favor_recall=True)
|
|
||||||
|
|
||||||
LOGGER = logging.getLogger("pool")
|
|
||||||
# logging.basicConfig(
|
|
||||||
# level=logging.INFO,
|
|
||||||
# format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
|
||||||
# )
|
|
||||||
|
|
||||||
class PlaywrightPool:
|
class PlaywrightPool:
|
||||||
_pw = None # playwright instance
|
_pw = None # playwright instance
|
||||||
_browser: Optional[Browser] = None
|
_browser: Optional[Browser] = None
|
||||||
|
@ -72,166 +54,4 @@ class PlaywrightPool:
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
return html
|
return html
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
class DBConnectionInfo:
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
dbname: str,
|
|
||||||
user: str,
|
|
||||||
password: str,
|
|
||||||
host: str = "localhost",
|
|
||||||
port: int = 5432,
|
|
||||||
) -> None:
|
|
||||||
self.host = host
|
|
||||||
self.port = port
|
|
||||||
self.dbname = dbname
|
|
||||||
self.user = user
|
|
||||||
self.password = password
|
|
||||||
|
|
||||||
|
|
||||||
class ArticleRepository:
|
|
||||||
"""
|
|
||||||
A very small wrapper around a database that maintains a single table
|
|
||||||
called 'articles' inside a database called 'newsulizer'.
|
|
||||||
|
|
||||||
• If you pass an existing DB-API connection, it will be used as-is.
|
|
||||||
• If you don’t pass anything, a local SQLite file called
|
|
||||||
'./newsulizer.sqlite3' is created/used automatically.
|
|
||||||
"""
|
|
||||||
|
|
||||||
_CREATE_DB_SQLITE = "newsulizer.sqlite3"
|
|
||||||
_TABLE_NAME = "articles"
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
connection_info: Optional[DBConnectionInfo] = None,
|
|
||||||
sqlite_path: Optional[str] = None,
|
|
||||||
) -> None:
|
|
||||||
"""
|
|
||||||
Parameters
|
|
||||||
----------
|
|
||||||
sqlite_path:
|
|
||||||
Path to an SQLite file. Defaults to ./newsulizer.sqlite3
|
|
||||||
when *connection* is omitted.
|
|
||||||
"""
|
|
||||||
|
|
||||||
if connection_info is None:
|
|
||||||
sqlite_path = sqlite_path or self._CREATE_DB_SQLITE
|
|
||||||
connection = self._make_sqlite_conn(sqlite_path)
|
|
||||||
self.cursor_type = "?"
|
|
||||||
else:
|
|
||||||
connection = self._make_postgres_conn(
|
|
||||||
host=connection_info.host,
|
|
||||||
port=connection_info.port,
|
|
||||||
dbname=connection_info.dbname,
|
|
||||||
user=connection_info.user,
|
|
||||||
password=connection_info.password,
|
|
||||||
)
|
|
||||||
self.cursor_type = "%s"
|
|
||||||
|
|
||||||
self._conn = connection
|
|
||||||
self._ensure_schema()
|
|
||||||
|
|
||||||
# Protect SQLite (which is not async-safe) by one lock
|
|
||||||
self._lock = asyncio.Lock()
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------ #
|
|
||||||
# public API
|
|
||||||
# ------------------------------------------------------------------ #
|
|
||||||
async def get_article(self, url: str) -> str:
|
|
||||||
"""
|
|
||||||
Main entry point.
|
|
||||||
• Returns the processed text if it is already cached.
|
|
||||||
• Otherwise downloads it, processes it, stores it, and returns it.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Single writer at a time when using sqlite3 – avoids `database is locked`
|
|
||||||
async with self._lock:
|
|
||||||
row = self._row_for_url(url)
|
|
||||||
|
|
||||||
if row and row[3]: # row = (id, url, raw, processed)
|
|
||||||
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
|
||||||
return row[3] # processed_html already present
|
|
||||||
|
|
||||||
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
|
||||||
raw_html = await PlaywrightPool.fetch_html(url)
|
|
||||||
processed_html = process_html(raw_html)
|
|
||||||
|
|
||||||
async with self._lock:
|
|
||||||
# Upsert:
|
|
||||||
self._conn.execute(
|
|
||||||
f"""
|
|
||||||
INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
|
|
||||||
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
|
||||||
raw_html=EXCLUDED.raw_html,
|
|
||||||
processed_html=EXCLUDED.processed_html
|
|
||||||
""",
|
|
||||||
(url, raw_html, processed_html),
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
return processed_html
|
|
||||||
|
|
||||||
def close(self) -> None:
|
|
||||||
"""Close the underlying DB connection."""
|
|
||||||
try:
|
|
||||||
self._conn.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
|
|
||||||
# ------------------------------------------------------------------ #
|
|
||||||
# internals
|
|
||||||
# ------------------------------------------------------------------ #
|
|
||||||
def _ensure_schema(self) -> None:
|
|
||||||
"""Create the articles table if it does not yet exist."""
|
|
||||||
# Simple feature detection for DBs that do not support
|
|
||||||
# `ON CONFLICT` (mainly older MySQL) could be added here.
|
|
||||||
self._conn.execute(
|
|
||||||
f"""
|
|
||||||
CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
|
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
||||||
url TEXT UNIQUE NOT NULL,
|
|
||||||
raw_html TEXT NOT NULL,
|
|
||||||
processed_html TEXT NOT NULL
|
|
||||||
)
|
|
||||||
"""
|
|
||||||
)
|
|
||||||
self._conn.commit()
|
|
||||||
|
|
||||||
def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
|
|
||||||
cur = self._conn.cursor()
|
|
||||||
cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
|
|
||||||
return cur.fetchone()
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _make_sqlite_conn(sqlite_path: str) -> sqlite3.Connection:
|
|
||||||
first_time = not os.path.exists(sqlite_path)
|
|
||||||
connection = sqlite3.connect(sqlite_path, check_same_thread=False)
|
|
||||||
# Enforce basic integrity
|
|
||||||
connection.execute("PRAGMA foreign_keys = ON")
|
|
||||||
connection.execute("PRAGMA busy_timeout = 5000")
|
|
||||||
|
|
||||||
if first_time:
|
|
||||||
# Ensure a human-readable filename, not an unnamed ATTACH
|
|
||||||
LOGGER.info(f"[ArticleRepository] Created fresh local database at '{sqlite_path}'")
|
|
||||||
else:
|
|
||||||
LOGGER.info(f"[ArticleRepository] Reusing existing local database at '{sqlite_path}'")
|
|
||||||
return connection
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _make_postgres_conn(*, host: str, port: int, dbname: str, user: str, password: Optional[str]):
|
|
||||||
try:
|
|
||||||
import psycopg2
|
|
||||||
except ModuleNotFoundError as exc:
|
|
||||||
raise RuntimeError(
|
|
||||||
"psycopg2 is required for PostgreSQL support – "
|
|
||||||
"run `pip install psycopg2-binary`"
|
|
||||||
) from exc
|
|
||||||
|
|
||||||
conn = psycopg2.connect(
|
|
||||||
host=host, port=port, dbname=dbname, user=user, password=password
|
|
||||||
)
|
|
||||||
conn.autocommit = False
|
|
||||||
return conn
|
|
Loading…
Reference in New Issue