Compare commits
No commits in common. "5db7b20e9fdbe2af8618839f3a24b96bd2bac2fe" and "329ffc8c8cd3cba664dfefe08c55d4f69aab3c27" have entirely different histories.
5db7b20e9f
...
329ffc8c8c
Binary file not shown.
Binary file not shown.
26
news/main.py
26
news/main.py
|
@ -196,13 +196,13 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
LOGGER.info("Received URL from %s: %s", message.author, url)
|
LOGGER.info("Received URL from %s: %s", message.author, url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
title, processed_html = await server.article_repository.get_article(url)
|
||||||
|
|
||||||
if await server.article_repository.has_paragraphs(url):
|
if await server.article_repository.has_paragraphs(url):
|
||||||
await message.channel.send("This article has already been processed.")
|
await message.channel.send("This article has already been processed.")
|
||||||
LOGGER.info(f"Article {url} already processed")
|
LOGGER.info(f"Article {url} already processed")
|
||||||
return
|
return
|
||||||
|
|
||||||
title, processed_html = await server.article_repository.fetch_article(url)
|
|
||||||
|
|
||||||
LOGGER.info(f"Article {url} has not been processed. Beginning now!")
|
LOGGER.info(f"Article {url} has not been processed. Beginning now!")
|
||||||
|
|
||||||
summary_bot = ChatBot(summary_system_prompt)
|
summary_bot = ChatBot(summary_system_prompt)
|
||||||
|
@ -347,24 +347,22 @@ async def on_message(message: discord.Message) -> None:
|
||||||
# Launch the processing task without blocking Discord’s event loop
|
# Launch the processing task without blocking Discord’s event loop
|
||||||
asyncio.create_task(handle_article_url(message, url))
|
asyncio.create_task(handle_article_url(message, url))
|
||||||
|
|
||||||
async def start_discord():
|
def _run_flask_blocking() -> NoReturn: # helper returns never
|
||||||
await bot.start(DISCORD_TOKEN)
|
server.app.run(host="0.0.0.0", port=8000, debug=False, use_reloader=False)
|
||||||
|
|
||||||
async def main():
|
|
||||||
|
def main() -> None:
|
||||||
if DISCORD_TOKEN is None:
|
if DISCORD_TOKEN is None:
|
||||||
raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")
|
raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")
|
||||||
|
|
||||||
try:
|
thread = threading.Thread(target=_run_flask_blocking, daemon=True, name="flask-api")
|
||||||
web_task = server.app.run_task(host="0.0.0.0", port=8000, debug=False)
|
thread.start()
|
||||||
discord_task = start_discord()
|
|
||||||
|
|
||||||
await asyncio.gather(web_task, discord_task)
|
try:
|
||||||
|
bot.run(DISCORD_TOKEN)
|
||||||
finally:
|
finally:
|
||||||
await PlaywrightPool.stop()
|
asyncio.run(PlaywrightPool.stop())
|
||||||
server.article_repository.close()
|
server.article_repository.close()
|
||||||
|
|
||||||
if not bot.is_closed():
|
|
||||||
await bot.close()
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
main()
|
||||||
|
|
65
news/pool.py
65
news/pool.py
|
@ -15,6 +15,10 @@ def process_html(html):
|
||||||
include_tables=True, include_comments=False, favor_recall=True)
|
include_tables=True, include_comments=False, favor_recall=True)
|
||||||
|
|
||||||
LOGGER = logging.getLogger("pool")
|
LOGGER = logging.getLogger("pool")
|
||||||
|
# logging.basicConfig(
|
||||||
|
# level=logging.INFO,
|
||||||
|
# format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||||
|
# )
|
||||||
|
|
||||||
class PlaywrightPool:
|
class PlaywrightPool:
|
||||||
_pw = None # playwright instance
|
_pw = None # playwright instance
|
||||||
|
@ -136,47 +140,41 @@ class ArticleRepository:
|
||||||
# ------------------------------------------------------------------ #
|
# ------------------------------------------------------------------ #
|
||||||
# public API
|
# public API
|
||||||
# ------------------------------------------------------------------ #
|
# ------------------------------------------------------------------ #
|
||||||
|
async def get_article(self, url: str) -> tuple[str, str]:
|
||||||
|
"""
|
||||||
|
Main entry point.
|
||||||
|
• Returns the processed text if it is already cached.
|
||||||
|
• Otherwise downloads it, processes it, stores it, and returns it.
|
||||||
|
"""
|
||||||
|
|
||||||
async def fetch_article(self, url: str) -> tuple[str, str]:
|
# Single writer at a time when using sqlite3 – avoids `database is locked`
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
result = self._get_article(url)
|
row = self._row_for_url(url)
|
||||||
if result:
|
|
||||||
return result
|
|
||||||
|
|
||||||
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
if row: # row = (id, url, title, raw, processed)
|
||||||
title, raw_html = await PlaywrightPool.fetch_html(url)
|
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
||||||
processed_html = process_html(raw_html)
|
return row[2], row[4] # processed_html already present
|
||||||
|
|
||||||
|
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
||||||
|
title, raw_html = await PlaywrightPool.fetch_html(url)
|
||||||
|
processed_html = process_html(raw_html)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
# Upsert:
|
# Upsert:
|
||||||
self._conn.execute(
|
self._conn.execute(
|
||||||
f"""
|
f"""
|
||||||
INSERT INTO articles (url, title, raw_html, processed_html)
|
INSERT INTO articles (url, title, raw_html, processed_html)
|
||||||
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
title=EXCLUDED.title,
|
title=EXCLUDED.title,
|
||||||
raw_html=EXCLUDED.raw_html,
|
raw_html=EXCLUDED.raw_html,
|
||||||
processed_html=EXCLUDED.processed_html
|
processed_html=EXCLUDED.processed_html
|
||||||
""",
|
""",
|
||||||
(url, title, raw_html, processed_html),
|
(url, title, raw_html, processed_html),
|
||||||
)
|
)
|
||||||
self._conn.commit()
|
self._conn.commit()
|
||||||
|
|
||||||
return title, processed_html
|
return title, processed_html
|
||||||
|
|
||||||
async def get_article(self, url: str) -> tuple[str, str] | None:
|
|
||||||
async with self._lock:
|
|
||||||
return self._get_article(url)
|
|
||||||
|
|
||||||
def _get_article(self, url: str) -> tuple[str, str] | None:
|
|
||||||
# Single writer at a time when using sqlite3 – avoids `database is locked`
|
|
||||||
row = self._row_for_url(url)
|
|
||||||
|
|
||||||
if row: # row = (id, url, title, raw, processed)
|
|
||||||
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
|
||||||
return row[2], row[4] # processed_html already present
|
|
||||||
|
|
||||||
LOGGER.info(f"[ArticleRepository] Article was not found for {url}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
async def has_paragraphs(self, url) -> bool:
|
async def has_paragraphs(self, url) -> bool:
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
|
@ -192,13 +190,6 @@ class ArticleRepository:
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
async def get_latest_articles(self, count):
|
|
||||||
async with self._lock:
|
|
||||||
cur = self._conn.cursor()
|
|
||||||
row = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,))
|
|
||||||
|
|
||||||
return row.fetchall()
|
|
||||||
|
|
||||||
async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
|
async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
article_id = self._row_for_url(url)[0]
|
article_id = self._row_for_url(url)[0]
|
||||||
|
|
|
@ -1,35 +1,36 @@
|
||||||
from quart import Quart, request, jsonify, abort
|
from flask import Flask, request, jsonify, abort
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
import logging
|
|
||||||
|
|
||||||
# Import the repository class from the existing code base.
|
# Import the repository class from the existing code base.
|
||||||
# Adjust the relative import path if pool.py lives in a package.
|
# Adjust the relative import path if pool.py lives in a package.
|
||||||
from pool import ArticleRepository
|
from pool import ArticleRepository
|
||||||
|
|
||||||
app = Quart(__name__)
|
app = Flask(__name__)
|
||||||
|
|
||||||
article_repository = ArticleRepository()
|
article_repository = ArticleRepository()
|
||||||
|
|
||||||
LOGGER = logging.getLogger("server")
|
|
||||||
|
|
||||||
@app.route("/health")
|
|
||||||
async def health():
|
|
||||||
return {"status": "ok"}
|
|
||||||
|
|
||||||
@app.route("/articles/<article_url>", methods=["GET"])
|
@app.route("/articles/<article_url>", methods=["GET"])
|
||||||
async def get_article(article_url: str):
|
def get_article(article_url: str):
|
||||||
article = await article_repository.get_article(article_url)
|
"""
|
||||||
|
Fetch one article by its numeric primary key.
|
||||||
|
Responds with the whole row in JSON or 404 if not present.
|
||||||
|
"""
|
||||||
|
article = article_repository.get_article(article_url)
|
||||||
if article is None:
|
if article is None:
|
||||||
abort(404, description="Article not found")
|
abort(404, description="Article not found")
|
||||||
return jsonify(article)
|
return jsonify(article)
|
||||||
|
|
||||||
@app.route("/article-by-url", methods=["GET"])
|
@app.route("/article-by-url", methods=["GET"])
|
||||||
async def get_article_by_url():
|
def get_article_by_url():
|
||||||
|
"""
|
||||||
|
Same as above but lets a client specify the canonical URL instead of the ID:
|
||||||
|
|
||||||
|
GET /article-by-url?url=https://example.com/foo
|
||||||
|
"""
|
||||||
url = request.args.get("url")
|
url = request.args.get("url")
|
||||||
if not url:
|
if not url:
|
||||||
abort(400, description="`url` query parameter is required")
|
abort(400, description="`url` query parameter is required")
|
||||||
|
|
||||||
LOGGER.info(f"Fetching article by URL: {url}")
|
|
||||||
article = await article_repository.get_article(url)
|
article = await article_repository.get_article(url)
|
||||||
if article is None:
|
if article is None:
|
||||||
abort(404, description="Article not found")
|
abort(404, description="Article not found")
|
||||||
|
|
|
@ -10,7 +10,6 @@ in pkgs.mkShell {
|
||||||
trafilatura
|
trafilatura
|
||||||
playwright
|
playwright
|
||||||
flask
|
flask
|
||||||
quart
|
|
||||||
]))
|
]))
|
||||||
];
|
];
|
||||||
propagatedBuildInputs = with pkgs; [
|
propagatedBuildInputs = with pkgs; [
|
||||||
|
|
Loading…
Reference in New Issue