From fdffd314fcb5995ef36fcb8d11ba7306bfe25131 Mon Sep 17 00:00:00 2001
From: Brett Laptop <tri11paragon@tpgc.me>
Date: Sun, 29 Jun 2025 21:32:23 -0400
Subject: [PATCH] sorta working relevance

---
 news/__pycache__/pool.cpython-311.pyc | Bin 13417 -> 13412 bytes
 news/main.py                          | 146 +++++++++++++++++++++-----
 news/pool.py                          |   2 +-
 3 files changed, 120 insertions(+), 28 deletions(-)

diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc
index 267ca6450f4027e3cf946e59ec93e4427602484c..dde08c7efd3e1e2ef31a85a8881c493adbbf0360 100644
GIT binary patch
delta 43
ycmaEv@g#$HIWI340}$k$Ow72xk@q$u3v)?D$>fiWPuNUPh+IfY+kBg;&kz7zgb=j=

delta 48
zcmaEo@iK#VIWI340}w=?jmx;Rk@q$uCr55#QFcmxdEVqVj8E9CPNZB&&)$5Esm~Aq
Dv#Jvc

diff --git a/news/main.py b/news/main.py
index 8b4abd2..bdb4669 100644
--- a/news/main.py
+++ b/news/main.py
@@ -4,7 +4,10 @@ import asyncio
 import collections
 import logging
 import os
-from typing import Final, Optional, List
+from pathlib import Path
+from typing import Final, Optional, List, NamedTuple
+from dataclasses import dataclass
+from textwrap import wrap
 
 import discord
 from dotenv import load_dotenv
@@ -62,6 +65,36 @@ facts_system_prompt = ("You are a specialized analysis program designed to deter
                        "If the article only presents opinions about genocide, then it is not accurately representing what happened). "
                        "You WILL give rating of this article by calling the increment tool if you read a paragraph (seperated by newlines) which is accurately representing facts, and decrement if it is not.")
 
+summary_system_prompt = ("You are a specialized analysis program designed to summarize articles. "
+                         "You WILL be given an article and you WILL output a summary of the article in less than 300 words. "
+                         "Do NOT include that you are attempting to meet the word count in your response.")
+
+relevance_system_prompt = ("You are a specialized analysis program designed to determine if a paragraph is relevant to the topic of the article. "
+                           "You will be given a summary of the article and a paragraph of text. "
+                           "You WILL respond with number between 0 and 100 indicating how relevant the paragraph is to the article. "  # trailing space: the next fragment starts a new sentence
+                           "You WILL NOT output anything else besides the number. "
+                           "An example response to a given input would be:\n "
+                           "-----\n"
+                           "Summary:\n "
+                           "The president of the United States has been in the White House for 20 years.\n"
+                           "-----\n"
+                           "Paragraph:\n"
+                           "\"The president of the United States has been in the White House for 20 years.\"\n"
+                           "-----\n"
+                           "Your response to this would then look like:\n"
+                           "100")
+
+@dataclass(frozen=True)
+class Response:
+    response: ChatResponse  # raw chat reply; only response["message"] is read by the accessors below
+
+    def content(self):  # the "content" field of the reply message
+        return self.response["message"]["content"]
+
+    def tools(self):  # the reply's tool calls; an empty list (not a KeyError) when there are none
+        return self.response["message"].get("tool_calls") or []
+
+
 class ChatBot:
     def __init__(self, system : str, host : str="192.168.69.3:11434"):
         self.client = AsyncClient(host=host)
@@ -70,18 +103,36 @@ class ChatBot:
         self.model = "llama3.2:3b"
         self.clear()
 
-    async def send_message(self, message : str):
+    async def send_message(self, message : str, **kwargs) -> Response:
         self.messages.append({"role": "user", "content": message})
         response = await self.client.chat(
             model=self.model,
             messages=self.messages,
-            stream=False)
+            stream=False,
+            **kwargs)
         self.messages.append({"role": "assistant", "content": response["message"]["content"]})
+        return Response(response)
+
+    async def single_chat(self, message : str, **kwargs) -> Response:
+        # One-shot request: only the system prompt and this message are sent;
+        # self.messages is neither read nor updated.
+        system_turn = {"role": "system", "content": self.system}
+        user_turn = {"role": "user", "content": message}
+        reply = await self.client.chat(model=self.model, messages=[system_turn, user_turn],
+                                       stream=False, **kwargs)
+        return Response(reply)
+
+    async def multi_summary(self, message: str, **kwargs) -> list[Response]:
+        # Split the text into character-limited chunks and send each one as an independent single_chat request, in order.
+        # The 256 floor is arbitrary: it only stops wrap() raising ValueError (width <= 0) when the system prompt is very long.
+        width = max(256, 4096 - len(self.system) - 64)
+        chunks = wrap(message, width=width)
+        return [await self.single_chat(chunk, **kwargs) for chunk in chunks]
 
     async def set_model(self, model : str):
         self.model = model
 
-    async def clear(self):
+    def clear(self):
         self.messages = []
         self.messages.append({"role": "system", "content": self.system})
 
@@ -101,10 +152,18 @@ async def send_chat_with_system(model, message, system, tools = None):
     messages = [{'role': 'system', 'content': system}, {'role': 'user', 'content': message}]
     return await send_chat(model, messages, tools)
 
-async def send_text_file(channel: discord.abc.Messageable, content: str | collections.abc.Sequence[str], message: str = "📄 Full article attached:", filename: str = "article.md") -> None:
-    fp = io.BytesIO(content.encode("utf-8"))
-    file = discord.File(fp, filename=filename)
-    await channel.send(message, file=file)
+async def send_text_file(channel: discord.abc.Messageable, *args: str | tuple[str,str], message: str = "📄 Full article attached:") -> None:
+    # Each positional argument becomes one UTF-8 attachment: either a
+    # (filename, text) tuple, or a bare string that is given a generated
+    # "Unnamed_file_<position>.txt" name. All attachments go out in one send.
+    files = []
+    for i, arg in enumerate(args):
+        if isinstance(arg, tuple):
+            name, text = arg[0], arg[1]
+        else:
+            name, text = f"Unnamed_file_{i}.txt", arg
+        files.append(discord.File(io.BytesIO(text.encode("utf-8")), filename=name))
+    await channel.send(message, files=files)
 
 def tally_responses(tools):
     increment = 0
@@ -153,35 +212,68 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
             }
         ]
 
-        social = await send_chat_with_system("social", processed_html, social_system_prompt, tools)
-        capital = await send_chat_with_system("capital", processed_html, capital_system_prompt, tools)
-        facts = await send_chat_with_system("facts", processed_html, facts_system_prompt, tools)
+        summary_bot = ChatBot(summary_system_prompt)
 
-        print(social)
-        print(capital)
-        print(facts)
+        summary_parts = await summary_bot.multi_summary(processed_html, options={
+            "temperature": 0.5,
+            "num_predict": 300,
+            "num_ctx": 4096
+        })
 
-        social_increment, social_decrement = tally_responses(social['message']["tool_calls"])
-        capital_increment, capital_decrement = tally_responses(capital['message']["tool_calls"])
-        facts_increment, facts_decrement = tally_responses(facts['message']["tool_calls"])
+        summary_parts_string = [part.content() for part in summary_parts]
+
+        summary = "\nSummary: ".join(summary_parts_string)
+
+        paragraphs = [para for para in processed_html.split("\n") if len(para.strip()) > 0]
+
+        relevance_bot = ChatBot(relevance_system_prompt)
+
+        paragraph_relevance = []
+
+        for paragraph in paragraphs:
+            # A newline before each "-----" keeps the separators on their own lines, matching the example in the system prompt.
+            response = await relevance_bot.single_chat("".join(["-----\n",
+                                               "Summary:\n ", summary, "\n-----\n",
+                                               "Paragraph:\n ", paragraph, "\n-----\n"]))
+            # Scores are stored as int so the median below sorts numerically ("100" sorts before "20" as strings).
+            paragraph_relevance.append(int(response.content()))
+
+        if not paragraph_relevance:
+            raise ValueError("article contains no paragraphs to score")
+
+        average_relevance = sum(paragraph_relevance) / len(paragraph_relevance)
+        median_relevance = sorted(paragraph_relevance)[len(paragraph_relevance) // 2]
+
+        relevance_cutoff = min(average_relevance, median_relevance)
+        LOGGER.info(f"Relevance cutoff: {relevance_cutoff}")
+
+        relevance_content = [f"{para} ({res}%)" for para, res in zip(paragraphs, paragraph_relevance) if res >= relevance_cutoff]
+        relevance_prompt = "\n\n".join(relevance_content)
+
+        # social = await send_chat_with_system("social", processed_html, social_system_prompt, tools)
+        # capital = await send_chat_with_system("capital", processed_html, capital_system_prompt, tools)
+        # facts = await send_chat_with_system("facts", processed_html, facts_system_prompt, tools)
+
+        # print(social)
+        # print(capital)
+        # print(facts)
+
+        # social_increment, social_decrement = tally_responses(social['message']["tool_calls"])
+        # capital_increment, capital_decrement = tally_responses(capital['message']["tool_calls"])
+        # facts_increment, facts_decrement = tally_responses(facts['message']["tool_calls"])
 
         # TODO: parse `html`, summarise, etc.
         await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
+        # time.sleep(0.1)
+        # await message.channel.send(f"Social+ {social_increment} | Social- {social_decrement} + Capital+ {capital_increment} | Capital- {capital_decrement} + Facts+ {facts_increment} | Facts- {facts_decrement}")
         time.sleep(0.1)
-        await send_text_file(message.channel, processed_html)
-        time.sleep(0.1)
-        await send_text_file(message.channel, social["message"]["content"], "Social calculations:")
-        time.sleep(0.1)
-        await message.channel.send(f"Social+ {social_increment} | Social- {social_decrement} + Capital+ {capital_increment} | Capital- {capital_decrement} + Facts+ {facts_increment} | Facts- {facts_decrement}")
-        time.sleep(0.1)
-        await send_text_file(message.channel, capital["message"]["content"], "capital calculations:")
-        time.sleep(0.1)
-        await send_text_file(message.channel, facts["message"]["content"], "facts calculations:")
+        # await send_text_file(message.channel, [processed_html, summary, social["message"]["content"], capital["message"]["content"], facts["message"]["content"]], "Files")
+        await send_text_file(message.channel, ("Raw Article.txt", processed_html), ("Summary.txt", summary), ("Paragraph Relevance.txt", relevance_prompt), message="Files")
         time.sleep(0.1)
     except Exception as exc:
         await message.channel.send("❌ Sorry, an internal error has occurred. Please try again later or contact an administrator.")
-        await message.channel.send(f"```\n{exc}\n```")
         LOGGER.error(exc, exc_info=True)
+        await message.channel.send(f"```\n{exc}\n```")
 
 
 def extract_first_url(text: str) -> Optional[str]:
diff --git a/news/pool.py b/news/pool.py
index d56fac9..00e7232 100644
--- a/news/pool.py
+++ b/news/pool.py
@@ -11,7 +11,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple
 import logging
 
 def process_html(html):
-    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
+    return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True,
                         include_tables=True, include_comments=False, favor_recall=True)
 
 LOGGER = logging.getLogger("pool")