sorta working relevance

2025-06-29 21:32:23 -04:00 · 2025-06-29 21:32:23 -04:00 · fdffd314fc
parent 1dafa31a60
commit fdffd314fc
3 changed files with 120 additions and 28 deletions
--- a/news/pycache/pool.cpython-311.pyc
+++ b/news/pycache/pool.cpython-311.pyc
--- a/news/main.py
+++ b/news/main.py
@ -4,7 +4,10 @@ import asyncio
 import collections
 import logging
 import os
-from typing import Final, Optional, List
+from pathlib import Path
+from typing import Final, Optional, List, NamedTuple
+from dataclasses import dataclass
+from textwrap import wrap

 import discord
 from dotenv import load_dotenv
@ -62,6 +65,36 @@ facts_system_prompt = ("You are a specialized analysis program designed to deter
                       "If the article only presents opinions about genocide, then it is not accurately representing what happened). "
                       "You WILL give rating of this article by calling the increment tool if you read a paragraph (seperated by newlines) which is accurately representing facts, and decrement if it is not.")

+summary_system_prompt = ("You are a specialized analysis program designed to summarize articles. "
+                         "You WILL be given an article and you WILL output a summary of the article in less than 300 words. "
+                         "Do NOT include that you are attempting to meet the word count in your response.")
+
+relevance_system_prompt = ("You are a specialized analysis program designed to determine if a paragraph is relevant to the topic of the article. "
+                           "You will be given a summary of the article and a paragraph of text. "
+                           "You WILL respond with number between 0 and 100 indicating how relevant the paragraph is to the article."
+                           "You WILL NOT output anything else besides the number. "
+                           "An example response to a given input would be:\n "
+                           "-----\n"
+                           "Summary:\n "
+                           "The president of the United States has been in the White House for 20 years.\n"
+                           "-----\n"
+                           "Paragraph:\n"
+                           "\"The president of the United States has been in the White House for 20 years.\"\n"
+                           "-----\n"
+                           "Your response to this would then look like:\n"
+                           "100")
+
+@dataclass(frozen=True)
+class Response:
+    response: ChatResponse
+
+    def content(self):
+        return self.response["message"]["content"]
+
+    def tools(self):
+        return self.response["message"]["tool_calls"]
+
+
 class ChatBot:
    def __init__(self, system : str, host : str="192.168.69.3:11434"):
        self.client = AsyncClient(host=host)
@ -70,18 +103,36 @@ class ChatBot:
        self.model = "llama3.2:3b"
        self.clear()

-    async def send_message(self, message : str):
+    async def send_message(self, message : str, **kwargs) -> Response:
        self.messages.append({"role": "user", "content": message})
        response = await self.client.chat(
            model=self.model,
            messages=self.messages,
-            stream=False)
+            stream=False,
+            **kwargs)
        self.messages.append({"role": "assistant", "content": response["message"]["content"]})
+        return Response(response)
+
+    async def single_chat(self, message : str, **kwargs) -> Response:
+        messages = [{"role": "system", "content": self.system}, {"role": "user", "content": message}]
+        return Response(await self.client.chat(
+            model=self.model,
+            messages=messages,
+            stream=False,
+            **kwargs
+        ))
+
+    async def multi_summary(self, message: str, **kwargs) -> list[Response]:
+        chunks = wrap(message, width=(4096 - len(self.system) - 64))
+        responses = []
+        for chunk in chunks:
+            responses.append(await self.single_chat(chunk, **kwargs))
+        return responses

    async def set_model(self, model : str):
        self.model = model

-    async def clear(self):
+    def clear(self):
        self.messages = []
        self.messages.append({"role": "system", "content": self.system})

@ -101,10 +152,18 @@ async def send_chat_with_system(model, message, system, tools = None):
    messages = [{'role': 'system', 'content': system}, {'role': 'user', 'content': message}]
    return await send_chat(model, messages, tools)

-async def send_text_file(channel: discord.abc.Messageable, content: str | collections.abc.Sequence[str], message: str = "📄 Full article attached:", filename: str = "article.md") -> None:
-    fp = io.BytesIO(content.encode("utf-8"))
-    file = discord.File(fp, filename=filename)
-    await channel.send(message, file=file)
+async def send_text_file(channel: discord.abc.Messageable, *args: str | tuple[str,str], message: str = "📄 Full article attached:") -> None:
+    strings = []
+    names = []
+    for i, arg in enumerate(args):
+        if isinstance(arg, tuple):
+            strings.append(arg[1])
+            names.append(arg[0])
+        else:
+            strings.append(arg)
+            names.append("Unnamed_file_" + str(i) + ".txt")
+    files = [discord.File(io.BytesIO(text.encode("utf-8")), filename=name) for name, text in zip(names, strings)]
+    await channel.send(message, files=files)

 def tally_responses(tools):
    increment = 0
@ -153,35 +212,68 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
            }
        ]

-        social = await send_chat_with_system("social", processed_html, social_system_prompt, tools)
-        capital = await send_chat_with_system("capital", processed_html, capital_system_prompt, tools)
-        facts = await send_chat_with_system("facts", processed_html, facts_system_prompt, tools)
+        summary_bot = ChatBot(summary_system_prompt)

-        print(social)
-        print(capital)
-        print(facts)
+        summary_parts = await summary_bot.multi_summary(processed_html, options={
+            "temperature": 0.5,
+            "num_predict": 300,
+            "num_ctx": 4096
+        })

-        social_increment, social_decrement = tally_responses(social['message']["tool_calls"])
-        capital_increment, capital_decrement = tally_responses(capital['message']["tool_calls"])
-        facts_increment, facts_decrement = tally_responses(facts['message']["tool_calls"])
+        summary_parts_string = [part.content() for part in summary_parts]
+
+        summary = "\nSummary: ".join(summary_parts_string)
+
+        paragraphs = [para for para in processed_html.split("\n") if len(para.strip()) > 0]
+
+        relevance_bot = ChatBot(relevance_system_prompt)
+
+        paragraph_relevance = []
+
+        for paragraph in paragraphs:
+            response = await relevance_bot.single_chat("".join(["-----\n",
+                                               "Summary:\n ",
+                                               summary, "-----\n",
+                                               "Paragraph:\n ",
+                                               paragraph, "-----\n"]))
+            paragraph_relevance.append(response.content())
+
+        for i, x in enumerate(paragraph_relevance):
+            paragraph_relevance[i] = str(int(x))
+
+        average_relevance = sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance)
+        median_relevance = int(sorted(paragraph_relevance)[len(paragraph_relevance) // 2])
+
+        relevance_cutoff = min(average_relevance, median_relevance)
+        LOGGER.info(f"Relevance cutoff: {relevance_cutoff}")
+
+        relevance_content = [para + " (" + res + "%)" for para, res in zip(paragraphs, paragraph_relevance) if int(res) >= relevance_cutoff]
+        relevance_prompt = "\n\n".join(relevance_content)
+
+        # social = await send_chat_with_system("social", processed_html, social_system_prompt, tools)
+        # capital = await send_chat_with_system("capital", processed_html, capital_system_prompt, tools)
+        # facts = await send_chat_with_system("facts", processed_html, facts_system_prompt, tools)
+
+        # print(social)
+        # print(capital)
+        # print(facts)
+
+        # social_increment, social_decrement = tally_responses(social['message']["tool_calls"])
+        # capital_increment, capital_decrement = tally_responses(capital['message']["tool_calls"])
+        # facts_increment, facts_decrement = tally_responses(facts['message']["tool_calls"])

        # TODO: parse `html`, summarise, etc.
        await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
+        # time.sleep(0.1)
+        # await message.channel.send(f"Social+ {social_increment} | Social- {social_decrement} + Capital+ {capital_increment} | Capital- {capital_decrement} + Facts+ {facts_increment} | Facts- {facts_decrement}")
        time.sleep(0.1)
-        await send_text_file(message.channel, processed_html)
-        time.sleep(0.1)
-        await send_text_file(message.channel, social["message"]["content"], "Social calculations:")
-        time.sleep(0.1)
-        await message.channel.send(f"Social+ {social_increment} | Social- {social_decrement} + Capital+ {capital_increment} | Capital- {capital_decrement} + Facts+ {facts_increment} | Facts- {facts_decrement}")
-        time.sleep(0.1)
-        await send_text_file(message.channel, capital["message"]["content"], "capital calculations:")
-        time.sleep(0.1)
-        await send_text_file(message.channel, facts["message"]["content"], "facts calculations:")
+        # await send_text_file(message.channel, [processed_html, summary, social["message"]["content"], capital["message"]["content"], facts["message"]["content"]], "Files")
+        await send_text_file(message.channel, ("Raw Article.txt", processed_html), ("Summary.txt", summary), ("Paragraph Relevance.txt", relevance_prompt), message="Files")
        time.sleep(0.1)
    except Exception as exc:
        await message.channel.send("❌ Sorry, an internal error has occurred. Please try again later or contact an administrator.")
-        await message.channel.send(f"```\n{exc}\n```")
        LOGGER.error(exc, exc_info=True)
+        await message.channel.send(f"```\n{exc}\n```")


 def extract_first_url(text: str) -> Optional[str]:
--- a/news/pool.py
+++ b/news/pool.py
@ -11,7 +11,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple
 import logging

 def process_html(html):
-    return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
+    return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True,
                        include_tables=True, include_comments=False, favor_recall=True)

 LOGGER = logging.getLogger("pool")