screw it i think im done with this.
parent
329ffc8c8c
commit
301483810e
Binary file not shown.
Binary file not shown.
|
@ -196,7 +196,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
|||
LOGGER.info("Received URL from %s: %s", message.author, url)
|
||||
|
||||
try:
|
||||
title, processed_html = await server.article_repository.get_article(url)
|
||||
title, processed_html = await server.article_repository.get_article_async(url)
|
||||
|
||||
if await server.article_repository.has_paragraphs(url):
|
||||
await message.channel.send("This article has already been processed.")
|
||||
|
|
86
news/pool.py
86
news/pool.py
|
@ -15,10 +15,6 @@ def process_html(html):
|
|||
include_tables=True, include_comments=False, favor_recall=True)
|
||||
|
||||
LOGGER = logging.getLogger("pool")
|
||||
# logging.basicConfig(
|
||||
# level=logging.INFO,
|
||||
# format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
|
||||
# )
|
||||
|
||||
class PlaywrightPool:
|
||||
_pw = None # playwright instance
|
||||
|
@ -140,7 +136,46 @@ class ArticleRepository:
|
|||
# ------------------------------------------------------------------ #
|
||||
# public API
|
||||
# ------------------------------------------------------------------ #
|
||||
async def get_article(self, url: str) -> tuple[str, str]:
|
||||
|
||||
async def get_article_async(self, url: str) -> tuple[str, str]:
|
||||
async with self._lock:
|
||||
result = self._get_article(url)
|
||||
if result:
|
||||
return result
|
||||
|
||||
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
||||
title, raw_html = await PlaywrightPool.fetch_html(url)
|
||||
processed_html = process_html(raw_html)
|
||||
|
||||
# Upsert:
|
||||
self._conn.execute(
|
||||
f"""
|
||||
INSERT INTO articles (url, title, raw_html, processed_html)
|
||||
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
title=EXCLUDED.title,
|
||||
raw_html=EXCLUDED.raw_html,
|
||||
processed_html=EXCLUDED.processed_html
|
||||
""",
|
||||
(url, title, raw_html, processed_html),
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
return title, processed_html
|
||||
|
||||
def get_article(self, url: str) -> tuple[str, str] | None:
|
||||
try:
|
||||
self._lock.acquire()
|
||||
return self._get_article(url)
|
||||
except Exception as exc:
|
||||
LOGGER.exception(f"[ArticleRepository] Error while getting article for {url}")
|
||||
LOGGER.exception(exc)
|
||||
return None
|
||||
finally:
|
||||
if self._lock.locked():
|
||||
self._lock.release()
|
||||
|
||||
def _get_article(self, url: str) -> tuple[str, str] | None:
|
||||
"""
|
||||
Main entry point.
|
||||
• Returns the processed text if it is already cached.
|
||||
|
@ -148,33 +183,14 @@ class ArticleRepository:
|
|||
"""
|
||||
|
||||
# Single writer at a time when using sqlite3 – avoids `database is locked`
|
||||
async with self._lock:
|
||||
row = self._row_for_url(url)
|
||||
row = self._row_for_url(url)
|
||||
|
||||
if row: # row = (id, url, title, raw, processed)
|
||||
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
||||
return row[2], row[4] # processed_html already present
|
||||
if row: # row = (id, url, title, raw, processed)
|
||||
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
||||
return row[2], row[4] # processed_html already present
|
||||
|
||||
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
||||
title, raw_html = await PlaywrightPool.fetch_html(url)
|
||||
processed_html = process_html(raw_html)
|
||||
|
||||
async with self._lock:
|
||||
# Upsert:
|
||||
self._conn.execute(
|
||||
f"""
|
||||
INSERT INTO articles (url, title, raw_html, processed_html)
|
||||
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
||||
ON CONFLICT(url) DO UPDATE SET
|
||||
title=EXCLUDED.title,
|
||||
raw_html=EXCLUDED.raw_html,
|
||||
processed_html=EXCLUDED.processed_html
|
||||
""",
|
||||
(url, title, raw_html, processed_html),
|
||||
)
|
||||
self._conn.commit()
|
||||
|
||||
return title, processed_html
|
||||
LOGGER.info(f"[ArticleRepository] Article was not found for {url}")
|
||||
return None
|
||||
|
||||
async def has_paragraphs(self, url) -> bool:
|
||||
async with self._lock:
|
||||
|
@ -190,6 +206,16 @@ class ArticleRepository:
|
|||
return False
|
||||
return True
|
||||
|
||||
def get_latest_articles(self, count):
|
||||
try:
|
||||
self._lock.acquire()
|
||||
cur = self._conn.cursor()
|
||||
row = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,))
|
||||
|
||||
return row.fetchall()
|
||||
finally:
|
||||
self._lock.release()
|
||||
|
||||
async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
|
||||
async with self._lock:
|
||||
article_id = self._row_for_url(url)[0]
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
from flask import Flask, request, jsonify, abort
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
# Import the repository class from the existing code base.
|
||||
# Adjust the relative import path if pool.py lives in a package.
|
||||
|
@ -9,12 +10,10 @@ app = Flask(__name__)
|
|||
|
||||
article_repository = ArticleRepository()
|
||||
|
||||
LOGGER = logging.getLogger("server")
|
||||
|
||||
@app.route("/articles/<article_url>", methods=["GET"])
|
||||
def get_article(article_url: str):
|
||||
"""
|
||||
Fetch one article by its numeric primary key.
|
||||
Responds with the whole row in JSON or 404 if not present.
|
||||
"""
|
||||
article = article_repository.get_article(article_url)
|
||||
if article is None:
|
||||
abort(404, description="Article not found")
|
||||
|
@ -22,16 +21,12 @@ def get_article(article_url: str):
|
|||
|
||||
@app.route("/article-by-url", methods=["GET"])
|
||||
def get_article_by_url():
|
||||
"""
|
||||
Same as above but lets a client specify the canonical URL instead of the ID:
|
||||
|
||||
GET /article-by-url?url=https://example.com/foo
|
||||
"""
|
||||
url = request.args.get("url")
|
||||
if not url:
|
||||
abort(400, description="`url` query parameter is required")
|
||||
|
||||
article = await article_repository.get_article(url)
|
||||
LOGGER.info(f"Fetching article by URL: {url}")
|
||||
article = article_repository.get_article(url)
|
||||
if article is None:
|
||||
abort(404, description="Article not found")
|
||||
return jsonify(article)
|
||||
|
|
Loading…
Reference in New Issue