diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc
index 02e1dfc..e6ee685 100644
Binary files a/news/__pycache__/pool.cpython-311.pyc and b/news/__pycache__/pool.cpython-311.pyc differ
diff --git a/news/__pycache__/server.cpython-311.pyc b/news/__pycache__/server.cpython-311.pyc
new file mode 100644
index 0000000..7b9411f
Binary files /dev/null and b/news/__pycache__/server.cpython-311.pyc differ
diff --git a/news/main.py b/news/main.py
index 0840374..c89ef45 100644
--- a/news/main.py
+++ b/news/main.py
@@ -196,7 +196,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)
 
     try:
-        title, processed_html = await server.article_repository.get_article(url)
+        title, processed_html = await server.article_repository.get_article_async(url)
 
         if await server.article_repository.has_paragraphs(url):
             await message.channel.send("This article has already been processed.")
diff --git a/news/pool.py b/news/pool.py
index 78cdc62..be99a49 100644
--- a/news/pool.py
+++ b/news/pool.py
@@ -15,10 +15,6 @@ def process_html(html):
         include_tables=True, include_comments=False, favor_recall=True)
 
 LOGGER = logging.getLogger("pool")
-# logging.basicConfig(
-#     level=logging.INFO,
-#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-# )
 
 class PlaywrightPool:
     _pw = None  # playwright instance
@@ -140,7 +136,46 @@ class ArticleRepository:
     # ------------------------------------------------------------------ #
     # public API                                                         #
     # ------------------------------------------------------------------ #
-    async def get_article(self, url: str) -> tuple[str, str]:
+
+    async def get_article_async(self, url: str) -> tuple[str, str]:
+        async with self._lock:
+            result = self._get_article(url)
+            if result:
+                return result
+
+            LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
+            title, raw_html = await PlaywrightPool.fetch_html(url)
+            processed_html = process_html(raw_html)
+
+            # Upsert:
+            self._conn.execute(
+                f"""
+                INSERT INTO articles (url, title, raw_html, processed_html)
+                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
+                ON CONFLICT(url) DO UPDATE SET
+                    title=EXCLUDED.title,
+                    raw_html=EXCLUDED.raw_html,
+                    processed_html=EXCLUDED.processed_html
+                """,
+                (url, title, raw_html, processed_html),
+            )
+            self._conn.commit()
+
+            return title, processed_html
+
+    def get_article(self, url: str) -> tuple[str, str] | None:
+        # Synchronous, cache-only lookup used by the Flask server.
+        # Acquire outside the try so the finally never releases an unheld lock.
+        self._lock.acquire()
+        try:
+            return self._get_article(url)
+        except Exception:
+            LOGGER.exception(f"[ArticleRepository] Error while getting article for {url}")
+            return None
+        finally:
+            self._lock.release()
+
+    def _get_article(self, url: str) -> tuple[str, str] | None:
         """
         Main entry point.
 
         • Returns the processed text if it is already cached.
@@ -148,33 +183,14 @@ class ArticleRepository:
         """
         # Single writer at a time when using sqlite3 – avoids `database is locked`
-        async with self._lock:
-            row = self._row_for_url(url)
+        row = self._row_for_url(url)
 
-            if row:                               # row = (id, url, title, raw, processed)
-                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
-                return row[2], row[4]             # processed_html already present
+        if row:                                   # row = (id, url, title, raw, processed)
+            LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
+            return row[2], row[4]                 # processed_html already present
 
-            LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
-            title, raw_html = await PlaywrightPool.fetch_html(url)
-            processed_html = process_html(raw_html)
-
-        async with self._lock:
-            # Upsert:
-            self._conn.execute(
-                f"""
-                INSERT INTO articles (url, title, raw_html, processed_html)
-                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
-                ON CONFLICT(url) DO UPDATE SET
-                    title=EXCLUDED.title,
-                    raw_html=EXCLUDED.raw_html,
-                    processed_html=EXCLUDED.processed_html
-                """,
-                (url, title, raw_html, processed_html),
-            )
-            self._conn.commit()
-
-        return title, processed_html
+        LOGGER.info(f"[ArticleRepository] No cached article found for {url}")
+        return None
 
     async def has_paragraphs(self, url) -> bool:
         async with self._lock:
@@ -190,6 +206,16 @@ class ArticleRepository:
             return False
         return True
 
+    def get_latest_articles(self, count):
+        # Synchronous read of the newest rows; LIMIT is passed as a bound parameter.
+        self._lock.acquire()
+        try:
+            cur = self._conn.cursor()
+            rows = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,))
+            return rows.fetchall()
+        finally:
+            self._lock.release()
+
     async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
         async with self._lock:
             article_id = self._row_for_url(url)[0]
diff --git a/news/server.py b/news/server.py
index adb2f1c..6afb151 100644
--- a/news/server.py
+++ b/news/server.py
@@ -1,5 +1,6 @@
 from flask import Flask, request, jsonify, abort
 from pathlib import Path
+import logging
 
 # Import the repository class from the existing code base.
 # Adjust the relative import path if pool.py lives in a package.
@@ -9,12 +10,10 @@ app = Flask(__name__)
 
 article_repository = ArticleRepository()
 
+LOGGER = logging.getLogger("server")
+
 @app.route("/articles/", methods=["GET"])
 def get_article(article_url: str):
-    """
-    Fetch one article by its numeric primary key.
-    Responds with the whole row in JSON or 404 if not present.
-    """
     article = article_repository.get_article(article_url)
     if article is None:
         abort(404, description="Article not found")
@@ -22,16 +21,12 @@ def get_article(article_url: str):
 
 @app.route("/article-by-url", methods=["GET"])
 def get_article_by_url():
-    """
-    Same as above but lets a client specify the canonical URL instead of the ID:
-
-        GET /article-by-url?url=https://example.com/foo
-    """
     url = request.args.get("url")
     if not url:
         abort(400, description="`url` query parameter is required")
 
-    article = await article_repository.get_article(url)
+    LOGGER.info(f"Fetching article by URL: {url}")
+    article = article_repository.get_article(url)
 
     if article is None:
         abort(404, description="Article not found")
     return jsonify(article)
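
Below is a minimal usage sketch of the split API introduced by this patch. It is illustrative only: the example URL is a placeholder, and the no-argument `ArticleRepository()` construction mirrors `server.py`. Note the intended asymmetry: `get_article_async` downloads and caches on a miss, while the synchronous `get_article` only consults the cache and returns `None` otherwise.

```python
import asyncio

from pool import ArticleRepository

repo = ArticleRepository()  # same no-arg construction as in server.py

async def bot_path(url: str) -> None:
    # Discord-bot path: downloads via PlaywrightPool and upserts on a cache miss.
    title, processed_html = await repo.get_article_async(url)
    print(f"cached {title!r} ({len(processed_html)} chars)")

def server_path(url: str) -> None:
    # Flask path: cache-only; None means the bot has not processed this URL yet.
    article = repo.get_article(url)
    print("hit" if article is not None else "miss")

url = "https://example.com/story"  # placeholder URL
asyncio.run(bot_path(url))
server_path(url)
```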
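
And a sketch of a client hitting the updated `/article-by-url` endpoint. The host and port are assumptions (Flask's development-server default); `jsonify` serialises the `(title, processed_html)` tuple as a two-element JSON array.

```python
import requests  # assumed available; any HTTP client works

resp = requests.get(
    "http://127.0.0.1:5000/article-by-url",       # assumed Flask dev-server address
    params={"url": "https://example.com/story"},  # placeholder URL
)
if resp.ok:
    title, processed_html = resp.json()  # tuple arrives as a JSON array
    print(title)
else:
    # 404 until the Discord bot has fetched and cached this URL
    print(resp.status_code, resp.text)
```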