big think

main
Brett 2025-06-27 13:26:23 -04:00
parent de6f9efbed
commit 275ac6e4ed
4 changed files with 45 additions and 16 deletions

View File

@ -34,14 +34,14 @@ logging.basicConfig(
article_repository = ArticleRepository()
async def send_chat(messages, fmt=None):
    """Send a chat request to the model server and return its reply.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts making up
            the conversation, passed through unchanged.
        fmt: Optional response-format constraint (the caller in this file
            passes ``"json"``). It is forwarded as the client's ``format``
            argument; ``None`` leaves the output unconstrained, which keeps
            one-argument calls working.

    Returns:
        Whatever the awaited ``AsyncClient.chat`` call returns; the request
        is made with ``stream=False``.

    NOTE(review): ``AsyncClient`` is presumably ``ollama.AsyncClient`` (the
    import is not visible here) -- confirm that it accepts ``format=``.
    Ollama's docs advise also telling the model to answer in JSON whenever
    ``format="json"`` is requested, so check the system prompt agrees.
    """
    return await AsyncClient(host="192.168.69.3:11434").chat(
        model="deepseek-r1:8b",
        messages=messages,
        stream=False,
        # Previously ``fmt`` was accepted but silently dropped.
        format=fmt,
        options={
            "temperature": 0.5,
            # Context-window override is currently disabled:
            # "num_ctx": 128000
        },
        think=True,
    )
@ -60,13 +60,39 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
LOGGER.info("Received URL from %s: %s", message.author, url)
try:
processed_html = await article_repository.get_article(url)
title, processed_html = await article_repository.get_article(url)
paragraphs = processed_html.split("\n")
paragraphs = [f"\"Paragraph ({i + 1})\": {paragraph.strip()}" for i, paragraph in enumerate(paragraphs)]
processed_graphs = [{"role": "user", "content": paragraph} for paragraph in paragraphs]
# print(paragraphs)
# print(processed_graphs)
messages = [
{"role": "system", "content": "You are an expert article-analysis assistant."
# "You WILL respond in JSON format."
"Your job is to analyse paragraphs in the article and look for provocative, emotionally charged, and loaded language"
"You will analyse the paragraphs, determine if they are provocative, and if so, output a capital X for each problematic word."
"Questions you should ask yourself while reading the paragraph:"
"1. What is the literal meaning of the questionable word or phrase?"
"2. What is the emotional or social context of the questionable word or phrase?"
"3. Does that word or phrase have any connotations, that is, associations that are positive or negative?"
"4. What group (sometimes called a “discourse community”) favors one locution over another, and why?"
"5. Is the word or phrase “loaded”? How far does it steer us from neutral?"
"6. Does the word or phrase help me see, or does it prevent me from seeing? (This is important)"
"You will now be provided with the headline of the article then a paragraph from the article."
"The headline (title of the page) will be provided as \"Headline\": \"EXAMPLE HEADLINE\"."
"The paragraphs will be provided as \"Paragraph (numbered index)\": \"EXAMPLE PARAGRAPH\"."},
]
messages.extend(processed_graphs)
response = await send_chat(messages, "json")
print(response)
# TODO: parse `html`, summarise, etc.
await message.channel.send(f"✅ Article downloaded {len(processed_html):,} bytes.")
await send_text_file(message.channel, processed_html)
except:
LOGGER.exception("Playwright failed")
await send_text_file(message.channel, response['response'])
except Exception as exc:
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
await message.channel.send(f"```\n{exc}\n```")
def extract_first_url(text: str) -> Optional[str]:

Binary file not shown.

View File

@ -61,7 +61,7 @@ class PlaywrightPool:
await cls._pw.stop()
@classmethod
async def fetch_html(cls, url: str) -> str:
async def fetch_html(cls, url: str) -> tuple[str, str]:
if cls._browser is None:
await cls.start()
@ -70,7 +70,8 @@ class PlaywrightPool:
try:
await page.goto(url, wait_until="load", timeout=60_000)
html = await page.content()
return html
title = await page.title()
return title, html
finally:
await page.close()
@ -139,7 +140,7 @@ class ArticleRepository:
# ------------------------------------------------------------------ #
# public API
# ------------------------------------------------------------------ #
async def get_article(self, url: str) -> str:
async def get_article(self, url: str) -> tuple[str, str]:
"""
Main entry point.
Returns the processed text if it is already cached.
@ -150,29 +151,30 @@ class ArticleRepository:
async with self._lock:
row = self._row_for_url(url)
if row and row[3]: # row = (id, url, raw, processed)
if row: # row = (id, url, title, raw, processed)
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
return row[3] # processed_html already present
return row[2], row[4] # processed_html already present
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
raw_html = await PlaywrightPool.fetch_html(url)
title, raw_html = await PlaywrightPool.fetch_html(url)
processed_html = process_html(raw_html)
async with self._lock:
# Upsert:
self._conn.execute(
f"""
INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
INSERT INTO {self._TABLE_NAME} (url, title, raw_html, processed_html)
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
ON CONFLICT(url) DO UPDATE SET
title=EXCLUDED.title,
raw_html=EXCLUDED.raw_html,
processed_html=EXCLUDED.processed_html
""",
(url, raw_html, processed_html),
(url, title, raw_html, processed_html),
)
self._conn.commit()
return processed_html
return title, processed_html
def close(self) -> None:
"""Close the underlying DB connection."""
@ -193,6 +195,7 @@ class ArticleRepository:
CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
id INTEGER PRIMARY KEY AUTOINCREMENT,
url TEXT UNIQUE NOT NULL,
title TEXT NOT NULL,
raw_html TEXT NOT NULL,
processed_html TEXT NOT NULL
)
@ -202,7 +205,7 @@ class ArticleRepository:
def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
    """Return the stored row for *url*, or ``None`` if no row matches.

    Args:
        url: Value compared against the table's ``url`` column.

    Returns:
        A tuple in SELECT order -- ``(id, url, title, raw_html,
        processed_html)`` -- or ``None`` when ``fetchone()`` finds nothing.
    """
    cur = self._conn.cursor()
    try:
        # Only the table name and the placeholder text held in
        # ``self.cursor_type`` are interpolated into the SQL; the URL itself
        # is always passed as a bound parameter.
        cur.execute(
            f"SELECT id, url, title, raw_html, processed_html "
            f"FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}",
            (url,),
        )
        return cur.fetchone()
    finally:
        # Release the cursor even if the query raises.
        cur.close()
@staticmethod