screw it i think im done with this.

main
Brett 2025-07-03 21:33:49 -04:00
parent 329ffc8c8c
commit 301483810e
5 changed files with 62 additions and 41 deletions

Binary file not shown.

View File

@@ -196,7 +196,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)
     try:
-        title, processed_html = await server.article_repository.get_article(url)
+        title, processed_html = await server.article_repository.get_article_async(url)
         if await server.article_repository.has_paragraphs(url):
            await message.channel.send("This article has already been processed.")
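
Note: after this commit the repository exposes two entry points. The Discord handler stays on the awaitable path, while the Flask routes (below) switch to the renamed blocking get_article, which only reads the cache. A minimal sketch of the two call styles, assuming repo is an ArticleRepository instance:

    # Async callers (the Discord handler): downloads on a cache miss.
    title, processed_html = await repo.get_article_async(url)

    # Sync callers (the Flask routes): cache-only, returns None on a miss.
    cached = repo.get_article(url)
    if cached is not None:
        title, processed_html = cached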

View File

@@ -15,10 +15,6 @@ def process_html(html):
                                 include_tables=True, include_comments=False, favor_recall=True)

 LOGGER = logging.getLogger("pool")
-# logging.basicConfig(
-#     level=logging.INFO,
-#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-# )

 class PlaywrightPool:
     _pw = None  # playwright instance
@@ -140,7 +136,46 @@ class ArticleRepository:
     # ------------------------------------------------------------------ #
     # public API
     # ------------------------------------------------------------------ #
-    async def get_article(self, url: str) -> tuple[str, str]:
+    async def get_article_async(self, url: str) -> tuple[str, str]:
+        async with self._lock:
+            result = self._get_article(url)
+            if result:
+                return result
+
+            LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
+            title, raw_html = await PlaywrightPool.fetch_html(url)
+            processed_html = process_html(raw_html)
+
+            # Upsert:
+            self._conn.execute(
+                f"""
+                INSERT INTO articles (url, title, raw_html, processed_html)
+                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
+                ON CONFLICT(url) DO UPDATE SET
+                    title=EXCLUDED.title,
+                    raw_html=EXCLUDED.raw_html,
+                    processed_html=EXCLUDED.processed_html
+                """,
+                (url, title, raw_html, processed_html),
+            )
+            self._conn.commit()
+
+            return title, processed_html
+
+    def get_article(self, url: str) -> tuple[str, str] | None:
+        try:
+            self._lock.acquire()
+            return self._get_article(url)
+        except Exception as exc:
+            LOGGER.exception(f"[ArticleRepository] Error while getting article for {url}")
+            LOGGER.exception(exc)
+            return None
+        finally:
+            if self._lock.locked():
+                self._lock.release()
+
+    def _get_article(self, url: str) -> tuple[str, str] | None:
         """
         Main entry point.
         Returns the processed text if it is already cached.
@@ -148,33 +183,14 @@ class ArticleRepository:
         """
-        # Single writer at a time when using sqlite3 avoids `database is locked`
-        async with self._lock:
-            row = self._row_for_url(url)
+        row = self._row_for_url(url)

-            if row:                               # row = (id, url, title, raw, processed)
-                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
-                return row[2], row[4]             # processed_html already present
+        if row:                                   # row = (id, url, title, raw, processed)
+            LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
+            return row[2], row[4]                 # processed_html already present

-        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
-        title, raw_html = await PlaywrightPool.fetch_html(url)
-        processed_html = process_html(raw_html)
-
-        async with self._lock:
-            # Upsert:
-            self._conn.execute(
-                f"""
-                INSERT INTO articles (url, title, raw_html, processed_html)
-                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
-                ON CONFLICT(url) DO UPDATE SET
-                    title=EXCLUDED.title,
-                    raw_html=EXCLUDED.raw_html,
-                    processed_html=EXCLUDED.processed_html
-                """,
-                (url, title, raw_html, processed_html),
-            )
-            self._conn.commit()
-
-        return title, processed_html
+        LOGGER.info(f"[ArticleRepository] Article was not found for {url}")
+        return None

     async def has_paragraphs(self, url) -> bool:
         async with self._lock:
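
Note: the async methods guard the connection with `async with self._lock`, while the new sync get_article calls self._lock.acquire() directly. No single lock type supports both styles: asyncio.Lock.acquire() is a coroutine (calling it un-awaited from sync code locks nothing), and threading.Lock has no __aenter__ for `async with`. The `if self._lock.locked(): release()` guard is also racy for a threading.Lock, since locked() is true while any thread holds the lock. A minimal sketch of one consistent arrangement, assuming a threading.Lock shared by both paths (the LockedRepo name and structure are illustrative, not the committed code):

    import asyncio
    import threading

    class LockedRepo:
        def __init__(self):
            self._lock = threading.Lock()

        def get_article(self, url):
            # Sync path: plain blocking acquire, always released by `with`.
            with self._lock:
                return self._get_article(url)

        async def get_article_async(self, url):
            # Async path: acquire in a worker thread so the event loop
            # is not blocked while another thread holds the lock.
            await asyncio.to_thread(self._lock.acquire)
            try:
                return self._get_article(url)
            finally:
                self._lock.release()

        def _get_article(self, url):
            return None  # cache lookup elided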
@@ -190,6 +206,16 @@ class ArticleRepository:
             return False
         return True

+    def get_latest_articles(self, count):
+        try:
+            self._lock.acquire()
+            cur = self._conn.cursor()
+            row = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,))
+            return row.fetchall()
+        finally:
+            self._lock.release()
+
     async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
         async with self._lock:
             article_id = self._row_for_url(url)[0]
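
Note: the write path uses INSERT ... ON CONFLICT(url) DO UPDATE SET ... with EXCLUDED references, a dialect accepted by both SQLite (3.24+) and PostgreSQL; presumably self.cursor_type holds the driver's parameter placeholder ("?" for sqlite3, "%s" for psycopg), which is also why the f-string in get_latest_articles keeps count as a bound parameter: only the placeholder token is interpolated, never the value. A self-contained sketch of the same upsert against sqlite3:

    import sqlite3

    conn = sqlite3.connect(":memory:")
    conn.execute("CREATE TABLE articles (url TEXT PRIMARY KEY, title TEXT)")
    for title in ("first", "second"):
        conn.execute(
            """
            INSERT INTO articles (url, title) VALUES (?, ?)
            ON CONFLICT(url) DO UPDATE SET title=EXCLUDED.title
            """,
            ("https://example.com/a", title),
        )
    conn.commit()
    # The second execute updates in place: prints [('second',)]
    print(conn.execute("SELECT title FROM articles").fetchall())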

View File

@@ -1,5 +1,6 @@
 from flask import Flask, request, jsonify, abort
 from pathlib import Path
+import logging

 # Import the repository class from the existing code base.
 # Adjust the relative import path if pool.py lives in a package.
@@ -9,12 +10,10 @@ app = Flask(__name__)
 article_repository = ArticleRepository()
+LOGGER = logging.getLogger("server")

 @app.route("/articles/<article_url>", methods=["GET"])
 def get_article(article_url: str):
     """
     Fetch one article by its URL.
     Responds with the whole row in JSON or 404 if not present.
     """
     article = article_repository.get_article(article_url)
     if article is None:
         abort(404, description="Article not found")
@@ -22,16 +21,12 @@ def get_article(article_url: str):
 @app.route("/article-by-url", methods=["GET"])
 def get_article_by_url():
     """
     Same as above but lets a client specify the canonical URL instead of the ID:
         GET /article-by-url?url=https://example.com/foo
     """
     url = request.args.get("url")
     if not url:
         abort(400, description="`url` query parameter is required")
-    article = await article_repository.get_article(url)
+    LOGGER.info(f"Fetching article by URL: {url}")
+    article = article_repository.get_article(url)
     if article is None:
         abort(404, description="Article not found")
     return jsonify(article)
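
Note: dropping the stray `await` fixes what would otherwise be a SyntaxError inside a plain `def` Flask view, and because the sync get_article never downloads, this endpoint now returns 404 for any URL the bot has not already ingested. A hedged usage sketch, assuming Flask's default host/port and the third-party requests package:

    import requests

    resp = requests.get(
        "http://127.0.0.1:5000/article-by-url",
        params={"url": "https://example.com/foo"},
    )
    if resp.status_code == 404:
        print("not ingested yet")            # cache-only lookup missed
    else:
        title, processed_html = resp.json()  # jsonify() of the (title, html) tuple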