From fdffd314fcb5995ef36fcb8d11ba7306bfe25131 Mon Sep 17 00:00:00 2001 From: Brett Laptop Date: Sun, 29 Jun 2025 21:32:23 -0400 Subject: [PATCH] sorta working relevance --- news/__pycache__/pool.cpython-311.pyc | Bin 13417 -> 13412 bytes news/main.py | 146 +++++++++++++++++++++----- news/pool.py | 2 +- 3 files changed, 120 insertions(+), 28 deletions(-) diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc index 267ca6450f4027e3cf946e59ec93e4427602484c..dde08c7efd3e1e2ef31a85a8881c493adbbf0360 100644 GIT binary patch delta 43 ycmaEv@g#$HIWI340}$k$Ow72xk@q$u3v)?D$>fiWPuNUPh+IfY+kBg;&kz7zgb=j= delta 48 zcmaEo@iK#VIWI340}w=?jmx;Rk@q$uCr55#QFcmxdEVqVj8E9CPNZB&&)$5Esm~Aq Dv#Jvc diff --git a/news/main.py b/news/main.py index 8b4abd2..bdb4669 100644 --- a/news/main.py +++ b/news/main.py @@ -4,7 +4,10 @@ import asyncio import collections import logging import os -from typing import Final, Optional, List +from pathlib import Path +from typing import Final, Optional, List, NamedTuple +from dataclasses import dataclass +from textwrap import wrap import discord from dotenv import load_dotenv @@ -62,6 +65,36 @@ facts_system_prompt = ("You are a specialized analysis program designed to deter "If the article only presents opinions about genocide, then it is not accurately representing what happened). " "You WILL give rating of this article by calling the increment tool if you read a paragraph (seperated by newlines) which is accurately representing facts, and decrement if it is not.") +summary_system_prompt = ("You are a specialized analysis program designed to summarize articles. " + "You WILL be given an article and you WILL output a summary of the article in less than 300 words. " + "Do NOT include that you are attempting to meet the word count in your response.") + +relevance_system_prompt = ("You are a specialized analysis program designed to determine if a paragraph is relevant to the topic of the article. " + "You will be given a summary of the article and a paragraph of text. " + "You WILL respond with number between 0 and 100 indicating how relevant the paragraph is to the article." + "You WILL NOT output anything else besides the number. " + "An example response to a given input would be:\n " + "-----\n" + "Summary:\n " + "The president of the United States has been in the White House for 20 years.\n" + "-----\n" + "Paragraph:\n" + "\"The president of the United States has been in the White House for 20 years.\"\n" + "-----\n" + "Your response to this would then look like:\n" + "100") + +@dataclass(frozen=True) +class Response: + response: ChatResponse + + def content(self): + return self.response["message"]["content"] + + def tools(self): + return self.response["message"]["tool_calls"] + + class ChatBot: def __init__(self, system : str, host : str="192.168.69.3:11434"): self.client = AsyncClient(host=host) @@ -70,18 +103,36 @@ class ChatBot: self.model = "llama3.2:3b" self.clear() - async def send_message(self, message : str): + async def send_message(self, message : str, **kwargs) -> Response: self.messages.append({"role": "user", "content": message}) response = await self.client.chat( model=self.model, messages=self.messages, - stream=False) + stream=False, + **kwargs) self.messages.append({"role": "assistant", "content": response["message"]["content"]}) + return Response(response) + + async def single_chat(self, message : str, **kwargs) -> Response: + messages = [{"role": "system", "content": self.system}, {"role": "user", "content": message}] + return Response(await self.client.chat( + model=self.model, + messages=messages, + stream=False, + **kwargs + )) + + async def multi_summary(self, message: str, **kwargs) -> list[Response]: + chunks = wrap(message, width=(4096 - len(self.system) - 64)) + responses = [] + for chunk in chunks: + responses.append(await self.single_chat(chunk, **kwargs)) + return responses async def set_model(self, model : str): self.model = model - async def clear(self): + def clear(self): self.messages = [] self.messages.append({"role": "system", "content": self.system}) @@ -101,10 +152,18 @@ async def send_chat_with_system(model, message, system, tools = None): messages = [{'role': 'system', 'content': system}, {'role': 'user', 'content': message}] return await send_chat(model, messages, tools) -async def send_text_file(channel: discord.abc.Messageable, content: str | collections.abc.Sequence[str], message: str = "📄 Full article attached:", filename: str = "article.md") -> None: - fp = io.BytesIO(content.encode("utf-8")) - file = discord.File(fp, filename=filename) - await channel.send(message, file=file) +async def send_text_file(channel: discord.abc.Messageable, *args: str | tuple[str,str], message: str = "📄 Full article attached:") -> None: + strings = [] + names = [] + for i, arg in enumerate(args): + if isinstance(arg, tuple): + strings.append(arg[1]) + names.append(arg[0]) + else: + strings.append(arg) + names.append("Unnamed_file_" + str(i) + ".txt") + files = [discord.File(io.BytesIO(text.encode("utf-8")), filename=name) for name, text in zip(names, strings)] + await channel.send(message, files=files) def tally_responses(tools): increment = 0 @@ -153,35 +212,68 @@ async def handle_article_url(message: discord.Message, url: str) -> None: } ] - social = await send_chat_with_system("social", processed_html, social_system_prompt, tools) - capital = await send_chat_with_system("capital", processed_html, capital_system_prompt, tools) - facts = await send_chat_with_system("facts", processed_html, facts_system_prompt, tools) + summary_bot = ChatBot(summary_system_prompt) - print(social) - print(capital) - print(facts) + summary_parts = await summary_bot.multi_summary(processed_html, options={ + "temperature": 0.5, + "num_predict": 300, + "num_ctx": 4096 + }) - social_increment, social_decrement = tally_responses(social['message']["tool_calls"]) - capital_increment, capital_decrement = tally_responses(capital['message']["tool_calls"]) - facts_increment, facts_decrement = tally_responses(facts['message']["tool_calls"]) + summary_parts_string = [part.content() for part in summary_parts] + + summary = "\nSummary: ".join(summary_parts_string) + + paragraphs = [para for para in processed_html.split("\n") if len(para.strip()) > 0] + + relevance_bot = ChatBot(relevance_system_prompt) + + paragraph_relevance = [] + + for paragraph in paragraphs: + response = await relevance_bot.single_chat("".join(["-----\n", + "Summary:\n ", + summary, "-----\n", + "Paragraph:\n ", + paragraph, "-----\n"])) + paragraph_relevance.append(response.content()) + + for i, x in enumerate(paragraph_relevance): + paragraph_relevance[i] = str(int(x)) + + average_relevance = sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance) + median_relevance = int(sorted(paragraph_relevance)[len(paragraph_relevance) // 2]) + + relevance_cutoff = min(average_relevance, median_relevance) + LOGGER.info(f"Relevance cutoff: {relevance_cutoff}") + + relevance_content = [para + " (" + res + "%)" for para, res in zip(paragraphs, paragraph_relevance) if int(res) >= relevance_cutoff] + relevance_prompt = "\n\n".join(relevance_content) + + # social = await send_chat_with_system("social", processed_html, social_system_prompt, tools) + # capital = await send_chat_with_system("capital", processed_html, capital_system_prompt, tools) + # facts = await send_chat_with_system("facts", processed_html, facts_system_prompt, tools) + + # print(social) + # print(capital) + # print(facts) + + # social_increment, social_decrement = tally_responses(social['message']["tool_calls"]) + # capital_increment, capital_decrement = tally_responses(capital['message']["tool_calls"]) + # facts_increment, facts_decrement = tally_responses(facts['message']["tool_calls"]) # TODO: parse `html`, summarise, etc. await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.") + # time.sleep(0.1) + # await message.channel.send(f"Social+ {social_increment} | Social- {social_decrement} + Capital+ {capital_increment} | Capital- {capital_decrement} + Facts+ {facts_increment} | Facts- {facts_decrement}") time.sleep(0.1) - await send_text_file(message.channel, processed_html) - time.sleep(0.1) - await send_text_file(message.channel, social["message"]["content"], "Social calculations:") - time.sleep(0.1) - await message.channel.send(f"Social+ {social_increment} | Social- {social_decrement} + Capital+ {capital_increment} | Capital- {capital_decrement} + Facts+ {facts_increment} | Facts- {facts_decrement}") - time.sleep(0.1) - await send_text_file(message.channel, capital["message"]["content"], "capital calculations:") - time.sleep(0.1) - await send_text_file(message.channel, facts["message"]["content"], "facts calculations:") + # await send_text_file(message.channel, [processed_html, summary, social["message"]["content"], capital["message"]["content"], facts["message"]["content"]], "Files") + await send_text_file(message.channel, ("Raw Article.txt", processed_html), ("Summary.txt", summary), ("Paragraph Relevance.txt", relevance_prompt), message="Files") time.sleep(0.1) except Exception as exc: await message.channel.send("❌ Sorry, an internal error has occurred. Please try again later or contact an administrator.") - await message.channel.send(f"```\n{exc}\n```") LOGGER.error(exc, exc_info=True) + await message.channel.send(f"```\n{exc}\n```") def extract_first_url(text: str) -> Optional[str]: diff --git a/news/pool.py b/news/pool.py index d56fac9..00e7232 100644 --- a/news/pool.py +++ b/news/pool.py @@ -11,7 +11,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple import logging def process_html(html): - return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, + return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True, include_tables=True, include_comments=False, favor_recall=True) LOGGER = logging.getLogger("pool")