super

2025-07-03 13:46:11 -04:00 · 2025-07-03 13:46:11 -04:00 · 329ffc8c8c
parent eb7ee3054d
commit 329ffc8c8c
6 changed files with 68 additions and 15 deletions
--- a/news/pycache/pool.cpython-312.pyc
+++ b/news/pycache/pool.cpython-312.pyc
--- a/news/pycache/server.cpython-312.pyc
+++ b/news/pycache/server.cpython-312.pyc
--- a/news/main.py
+++ b/news/main.py
@ -20,6 +20,10 @@ from ollama import Client
 from ollama import AsyncClient
 import time
 import json
+import server
+import threading
+from typing import NoReturn
+

 load_dotenv()

@ -39,8 +43,6 @@ logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S"
 )

-article_repository = ArticleRepository()
-
 social_system_prompt = ("You are a specialized analysis program designed to determine if articles are pro-social. "
                        "Pro-social text contains topics such as raising concerns about the negative effects on workers, the environment, "
                        "or on society as a whole (as in the concerns of the 99%, or the proletariat). "
@ -194,9 +196,9 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
    LOGGER.info("Received URL from %s: %s", message.author, url)

    try:
-        title, processed_html = await article_repository.get_article(url)
+        title, processed_html = await server.article_repository.get_article(url)

-        if await article_repository.has_paragraphs(url):
+        if await server.article_repository.has_paragraphs(url):
            await message.channel.send("This article has already been processed.")
            LOGGER.info(f"Article {url} already processed")
            return
@ -213,15 +215,19 @@ async def handle_article_url(message: discord.Message, url: str) -> None:

        summary_bot.set_system("You are a specialized analysis program designed to summarize articles into their key points.\n "
                               "You WILL output as many key points as possible, but you MUST output at least 1 key point.\n"
-                               "You WILL only output a JSON list of key points, structured as {key_points: [\"keypoint1\", \"keypoint2\",...]}. ")
+                               "You WILL only output a JSON list of key points, structured as {key_points: [\"keypoint1\", \"keypoint2\",...]}. "
+                               "DO NOT OUTPUT ANYTHING BUT THE PROPERLY FORMATTED JSON.")

        try:
-            keywords = [item for sublist in (json.loads(sumr.content())["key_points"] for sumr in await summary_bot.multi_summary(processed_html, options={
+            silly = await summary_bot.multi_summary(processed_html, options={
                "temperature": 0.5,
                "num_ctx": 4096
-            })) for item in sublist]
+            }, format="json")
+            keywords = [item for sublist in (json.loads(sumr.content())["key_points"] for sumr in silly) for item in sublist]
            LOGGER.info(keywords)
        except Exception as exc:
+            for sil in silly:
+                LOGGER.error(sil.content())
            LOGGER.error("Failed to correctly parse LLM output. It is likely that it has failed.")
            LOGGER.error(exc, exc_info=True)
            keywords = []
@ -257,6 +263,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
                LOGGER.info(f"Running for keyword {keypoint} got response {keypoint_is_rev.content()}")
                restitutions.append(keypoint_is_rev.content())
            restitutions.append(res.content())
+            LOGGER.info(f"Restitutions: {restitutions}")
            yes, no, err = tally_responses(restitutions)
            total = yes + no + err
            paragraph_relevance.append(response.content())
@ -266,9 +273,9 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
        for i, x in enumerate(paragraph_relevance):
            paragraph_relevance[i] = int(x)

-        await article_repository.set_paragraphs(url, paragraphs, summary, paragraph_relevance, keywords, paragraph_restitutions)
+        await server.article_repository.set_paragraphs(url, paragraphs, summary, paragraph_relevance, keywords, paragraph_restitutions)

-        average_relevance = (sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance) + sum(paragraph_keypoints)) / 2
+        average_relevance = ((sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance)) + (sum(paragraph_keypoints) / len(paragraph_keypoints))) / 2
        median_relevance = sorted(int(ref) for ref in paragraph_relevance)[len(paragraph_relevance) // 2]
        median_relevance2 = sorted(paragraph_keypoints)[len(paragraph_keypoints) // 2]

@ -340,14 +347,22 @@ async def on_message(message: discord.Message) -> None:
    # Launch the processing task without blocking Discord’s event loop
    asyncio.create_task(handle_article_url(message, url))

+def _run_flask_blocking() -> None:
+    """
+    Serve ``server.app`` on 0.0.0.0:8000, blocking the calling thread.
+
+    Meant to be used as a ``threading.Thread`` target. The return type is
+    ``None`` rather than ``NoReturn`` because ``app.run`` returns normally
+    once the development server stops.
+    """
+    # NOTE(review): host="0.0.0.0" listens on every interface -- confirm
+    # that exposing the API beyond localhost is intended.
+    server.app.run(host="0.0.0.0", port=8000, debug=False, use_reloader=False)
+
+
 def main() -> None:
    if DISCORD_TOKEN is None:
        raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")

+    thread = threading.Thread(target=_run_flask_blocking, daemon=True, name="flask-api")
+    thread.start()
+
    try:
        bot.run(DISCORD_TOKEN)
    finally:
        asyncio.run(PlaywrightPool.stop())
+        server.article_repository.close()

 if __name__ == "__main__":
    main()
--- a/news/pool.py
+++ b/news/pool.py
@ -11,7 +11,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple
 import logging

 def process_html(html):
-    return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True,
+    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
                        include_tables=True, include_comments=False, favor_recall=True)

 LOGGER = logging.getLogger("pool")
@ -186,7 +186,7 @@ class ArticleRepository:
            row = cur.execute(f"SELECT COUNT(*) FROM summaries WHERE article_id = {row[0]}")

            result = row.fetchone()
-            if not row.fetchone() or row.fetchone()[0] == 0:
+            if not result or result[0] == 0:
                return False
            return True

@ -209,10 +209,10 @@ class ArticleRepository:
            topic_ids = []
            for topic in topics:
                rows = cur.execute(f"""
-                    INSERT INTO topics (article_id, topic_text, type) 
-                    VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
+                    INSERT INTO topics (article_id, topic_text) 
+                    VALUES ({self.cursor_type}, {self.cursor_type})
                    RETURNING id;
-                """, (article_id, topic, "keypoint"))
+                """, (article_id, topic))
                topic_ids.append(rows.fetchone()[0])

            for paragraph, summary_rating, gel in zip(paragraphs, summary_ratings, topic_ratings):
--- a/news/server.py
+++ b/news/server.py
@ -0,0 +1,37 @@
+from flask import Flask, request, jsonify, abort
+from pathlib import Path
+
+# Import the repository class from the existing code base.
+# Adjust the relative import path if pool.py lives in a package.
+from pool import ArticleRepository
+
+app = Flask(__name__)
+
+article_repository = ArticleRepository()
+
+@app.route("/articles/<article_url>", methods=["GET"])
+def get_article(article_url: str):
+    """
+    Fetch one article identified by the ``article_url`` path segment.
+
+    The segment is passed unchanged to ``article_repository.get_article``.
+    Flask's default string converter does not match ``/``, so values that
+    contain slashes are better served by ``/article-by-url``.
+
+    Responds with the repository result as JSON, or 404 if it is ``None``.
+    """
+    # Local import: this module's import block does not bring in asyncio.
+    import asyncio
+
+    # The repository method is a coroutine function (it is awaited elsewhere
+    # in this code base). Calling it without driving it would hand a
+    # coroutine object to jsonify() and the None check could never fire.
+    # This view is synchronous, so the coroutine is run to completion here.
+    # NOTE(review): assumes the repository is not bound to the Discord bot's
+    # event loop -- confirm against pool.py.
+    article = asyncio.run(article_repository.get_article(article_url))
+    if article is None:
+        abort(404, description="Article not found")
+    return jsonify(article)
+
+@app.route("/article-by-url", methods=["GET"])
+def get_article_by_url():
+    """
+    Fetch one article whose URL is given in the ``url`` query parameter:
+
+    GET /article-by-url?url=https://example.com/foo
+
+    Responds with 400 if ``url`` is missing or empty, 404 if the repository
+    returns ``None``, and otherwise the repository result as JSON.
+    """
+    # Local import: this module's import block does not bring in asyncio.
+    import asyncio
+
+    url = request.args.get("url")
+    if not url:
+        abort(400, description="`url` query parameter is required")
+
+    # `await` is a SyntaxError inside a plain `def`, which would make the
+    # whole module unimportable. The view stays synchronous and drives the
+    # repository coroutine to completion instead.
+    # NOTE(review): assumes the repository is not bound to the Discord bot's
+    # event loop -- confirm against pool.py.
+    article = asyncio.run(article_repository.get_article(url))
+    if article is None:
+        abort(404, description="Article not found")
+    return jsonify(article)
--- a/news/shell.nix
+++ b/news/shell.nix
@ -9,6 +9,7 @@ in pkgs.mkShell {
      python-dotenv
      trafilatura
      playwright
+	  flask
    ]))
  ];
  propagatedBuildInputs = with pkgs; [
@ -48,4 +49,4 @@ in pkgs.mkShell {
      export PLAYWRIGHT_BROWSERS_PATH=${pkgs.playwright-driver.browsers}
      export PLAYWRIGHT_SKIP_VALIDATE_HOST_REQUIREMENTS=true
    '';
-}
+}