From 329ffc8c8cd3cba664dfefe08c55d4f69aab3c27 Mon Sep 17 00:00:00 2001
From: Brett
Date: Thu, 3 Jul 2025 13:46:11 -0400
Subject: [PATCH] super

---
 news/__pycache__/pool.cpython-312.pyc   | Bin 18397 -> 18273 bytes
 news/__pycache__/server.cpython-312.pyc | Bin 0 -> 1699 bytes
 news/main.py                            | 33 +++++++++++++++------
 news/pool.py                            | 10 +++----
 news/server.py                          | 37 ++++++++++++++++++++++++
 news/shell.nix                          |  3 +-
 6 files changed, 68 insertions(+), 15 deletions(-)
 create mode 100644 news/__pycache__/server.cpython-312.pyc
 create mode 100644 news/server.py

diff --git a/news/__pycache__/pool.cpython-312.pyc b/news/__pycache__/pool.cpython-312.pyc
index 774bfad158df47b259f55ddef7aad7de04e94450..b30e1df8c611d24334116e5bef004277fd01ed4f 100644
Binary files a/news/__pycache__/pool.cpython-312.pyc and b/news/__pycache__/pool.cpython-312.pyc differ
diff --git a/news/__pycache__/server.cpython-312.pyc b/news/__pycache__/server.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50389b6d5a3301cb6121a0dcb98879ba9764a739
Binary files /dev/null and b/news/__pycache__/server.cpython-312.pyc differ
diff --git a/news/main.py b/news/main.py
index 8740e1d..0840374 100644
--- a/news/main.py
+++ b/news/main.py
@@ -20,6 +20,10 @@ from ollama import Client
 from ollama import AsyncClient
 import time
 import json
+import server
+import threading
+from typing import NoReturn
+
 
 load_dotenv()
@@ -39,8 +43,6 @@ logging.basicConfig(
     datefmt="%Y-%m-%d %H:%M:%S"
 )
 
-article_repository = ArticleRepository()
-
 social_system_prompt = ("You are a specialized analysis program designed to determine if articles are pro-social. "
                         "Pro-social text contains topics such as raising concerns about the negative effects on workers, the environment, "
                         "or on society as a whole (as in the concerns of the 99%, or the proletariat). "
@@ -194,9 +196,9 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)
 
     try:
-        title, processed_html = await article_repository.get_article(url)
+        title, processed_html = await server.article_repository.get_article(url)
 
-        if await article_repository.has_paragraphs(url):
+        if await server.article_repository.has_paragraphs(url):
             await message.channel.send("This article has already been processed.")
             LOGGER.info(f"Article {url} already processed")
             return
@@ -213,15 +215,19 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
 
         summary_bot.set_system("You are a specialized analysis program designed to summarize articles into their key points.\n "
                                "You WILL output as many key points as possible, but you MUST output at least 1 key point.\n"
-                               "You WILL only output a JSON list of key points, structured as {key_points: [\"keypoint1\", \"keypoint2\",...]}. ")
+                               "You WILL only output a JSON list of key points, structured as {key_points: [\"keypoint1\", \"keypoint2\",...]}. "
+                               "DO NOT OUTPUT ANYTHING BUT THE PROPERLY FORMATTED JSON.")
 
         try:
-            keywords = [item for sublist in (json.loads(sumr.content())["key_points"] for sumr in await summary_bot.multi_summary(processed_html, options={
+            silly = await summary_bot.multi_summary(processed_html, options={
                 "temperature": 0.5,
                 "num_ctx": 4096
-            })) for item in sublist]
+            }, format="json")
+            keywords = [item for sublist in (json.loads(sumr.content())["key_points"] for sumr in silly) for item in sublist]
             LOGGER.info(keywords)
         except Exception as exc:
+            for sil in silly:
+                LOGGER.error(sil.content())
             LOGGER.error("Failed to correctly parse LLM output. It is likely that it has failed.")
It is likely that it has failed.") LOGGER.error(exc, exc_info=True) keywords = [] @@ -257,6 +263,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None: LOGGER.info(f"Running for keyword {keypoint} got response {keypoint_is_rev.content()}") restitutions.append(keypoint_is_rev.content()) restitutions.append(res.content()) + LOGGER.info(f"Restitutions: {restitutions}") yes, no, err = tally_responses(restitutions) total = yes + no + err paragraph_relevance.append(response.content()) @@ -266,9 +273,9 @@ async def handle_article_url(message: discord.Message, url: str) -> None: for i, x in enumerate(paragraph_relevance): paragraph_relevance[i] = int(x) - await article_repository.set_paragraphs(url, paragraphs, summary, paragraph_relevance, keywords, paragraph_restitutions) + await server.article_repository.set_paragraphs(url, paragraphs, summary, paragraph_relevance, keywords, paragraph_restitutions) - average_relevance = (sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance) + sum(paragraph_keypoints)) / 2 + average_relevance = ((sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance)) + (sum(paragraph_keypoints) / len(paragraph_keypoints))) / 2 median_relevance = sorted(int(ref) for ref in paragraph_relevance)[len(paragraph_relevance) // 2] median_relevance2 = sorted(paragraph_keypoints)[len(paragraph_keypoints) // 2] @@ -340,14 +347,22 @@ async def on_message(message: discord.Message) -> None: # Launch the processing task without blocking Discord’s event loop asyncio.create_task(handle_article_url(message, url)) +def _run_flask_blocking() -> NoReturn: # helper returns never + server.app.run(host="0.0.0.0", port=8000, debug=False, use_reloader=False) + + def main() -> None: if DISCORD_TOKEN is None: raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.") + thread = threading.Thread(target=_run_flask_blocking, daemon=True, name="flask-api") + thread.start() + try: bot.run(DISCORD_TOKEN) finally: asyncio.run(PlaywrightPool.stop()) + server.article_repository.close() if __name__ == "__main__": main() diff --git a/news/pool.py b/news/pool.py index cd6e93b..78cdc62 100644 --- a/news/pool.py +++ b/news/pool.py @@ -11,7 +11,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple import logging def process_html(html): - return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True, + return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, include_tables=True, include_comments=False, favor_recall=True) LOGGER = logging.getLogger("pool") @@ -186,7 +186,7 @@ class ArticleRepository: row = cur.execute(f"SELECT COUNT(*) FROM summaries WHERE article_id = {row[0]}") result = row.fetchone() - if not row.fetchone() or row.fetchone()[0] == 0: + if not result or result[0] == 0: return False return True @@ -209,10 +209,10 @@ class ArticleRepository: topic_ids = [] for topic in topics: rows = cur.execute(f""" - INSERT INTO topics (article_id, topic_text, type) - VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}) + INSERT INTO topics (article_id, topic_text) + VALUES ({self.cursor_type}, {self.cursor_type}) RETURNING id; - """, (article_id, topic, "keypoint")) + """, (article_id, topic)) topic_ids.append(rows.fetchone()[0]) for paragraph, summary_rating, gel in zip(paragraphs, summary_ratings, topic_ratings): diff --git a/news/server.py b/news/server.py new file mode 100644 index 0000000..adb2f1c --- /dev/null +++ 
@@ -0,0 +1,37 @@
+import asyncio
+from flask import Flask, request, jsonify, abort
+
+# Import the repository class from the existing code base.
+# Adjust the relative import path if pool.py lives in a package.
+from pool import ArticleRepository
+
+app = Flask(__name__)
+
+article_repository = ArticleRepository()
+
+@app.route("/articles/<path:article_url>", methods=["GET"])
+def get_article(article_url: str):
+    """
+    Fetch one article by the URL embedded in the request path.
+    Responds with the whole row in JSON or 404 if not present.
+    """
+    article = asyncio.run(article_repository.get_article(article_url))
+    if article is None:
+        abort(404, description="Article not found")
+    return jsonify(article)
+
+@app.route("/article-by-url", methods=["GET"])
+def get_article_by_url():
+    """
+    Same as above but lets a client pass the canonical URL as a query parameter:
+
+    GET /article-by-url?url=https://example.com/foo
+    """
+    url = request.args.get("url")
+    if not url:
+        abort(400, description="`url` query parameter is required")
+
+    article = asyncio.run(article_repository.get_article(url))
+    if article is None:
+        abort(404, description="Article not found")
+    return jsonify(article)
diff --git a/news/shell.nix b/news/shell.nix
index b1d1be1..4b323cd 100644
--- a/news/shell.nix
+++ b/news/shell.nix
@@ -9,6 +9,7 @@ in pkgs.mkShell {
       python-dotenv
       trafilatura
       playwright
+      flask
     ]))
   ];
   propagatedBuildInputs = with pkgs; [
@@ -48,4 +49,4 @@ in pkgs.mkShell {
     export PLAYWRIGHT_BROWSERS_PATH=${pkgs.playwright-driver.browsers}
     export PLAYWRIGHT_SKIP_VALIDATE_HOST_REQUIREMENTS=true
   '';
-}
\ No newline at end of file
+}
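
Once main.py is running it starts the Flask thread on port 8000, so a quick way to exercise the new HTTP endpoint is a small stdlib-only client like the sketch below; the article URL is only a placeholder, and the host, port, and /article-by-url route assume the defaults hard-coded in this patch.

import json
import urllib.parse
import urllib.request

# Placeholder article URL; any URL the repository knows about (or can fetch) works.
article_url = "https://example.com/foo"
query = urllib.parse.urlencode({"url": article_url})

# /article-by-url and port 8000 come from server.py and main.py in this patch.
with urllib.request.urlopen(f"http://localhost:8000/article-by-url?{query}") as resp:
    print(json.dumps(json.load(resp), indent=2))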