big think
parent
de6f9efbed
commit
275ac6e4ed
Binary file not shown.
36
news/main.py
36
news/main.py
|
@ -34,14 +34,14 @@ logging.basicConfig(
|
||||||
|
|
||||||
article_repository = ArticleRepository()
|
article_repository = ArticleRepository()
|
||||||
|
|
||||||
async def send_chat(messages):
|
async def send_chat(messages, fmt):
|
||||||
return await AsyncClient(host="192.168.69.3:11434").chat(
|
return await AsyncClient(host="192.168.69.3:11434").chat(
|
||||||
model="deepseek-r1:8b",
|
model="deepseek-r1:8b",
|
||||||
messages=messages,
|
messages=messages,
|
||||||
stream=False,
|
stream=False,
|
||||||
options={
|
options={
|
||||||
'temperature': 0.5,
|
'temperature': 0.5,
|
||||||
"num_ctx": 128000
|
# "num_ctx": 128000
|
||||||
},
|
},
|
||||||
think=True)
|
think=True)
|
||||||
|
|
||||||
|
@ -60,13 +60,39 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
LOGGER.info("Received URL from %s: %s", message.author, url)
|
LOGGER.info("Received URL from %s: %s", message.author, url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
processed_html = await article_repository.get_article(url)
|
title, processed_html = await article_repository.get_article(url)
|
||||||
|
paragraphs = processed_html.split("\n")
|
||||||
|
paragraphs = [f"\"Paragraph ({i + 1})\": {paragraph.strip()}" for i, paragraph in enumerate(paragraphs)]
|
||||||
|
processed_graphs = [{"role": "user", "content": paragraph} for paragraph in paragraphs]
|
||||||
|
# print(paragraphs)
|
||||||
|
# print(processed_graphs)
|
||||||
|
|
||||||
|
messages = [
|
||||||
|
{"role": "system", "content": "You are an expert article-analysis assistant."
|
||||||
|
# "You WILL respond in JSON format."
|
||||||
|
"Your job is to analyse paragraphs in the article and look for provocative, emotionally charged, and loaded language"
|
||||||
|
"You will analyse the paragraphs, determine if they are provocative, and if so, output a capital X for each problematic word."
|
||||||
|
"Questions you should ask yourself while reading the paragraph:"
|
||||||
|
"1. What is the literal meaning of the questionable word or phrase?"
|
||||||
|
"2. What is the emotional or social context of the questionable word or phrase?"
|
||||||
|
"3. Does that word or phrase have any connotations, that is, associations that are positive or negative?"
|
||||||
|
"4. What group (sometimes called a “discourse community”) favors one locution over another, and why?"
|
||||||
|
"5. Is the word or phrase “loaded”? How far does it steer us from neutral?"
|
||||||
|
"6. Does the word or phrase help me see, or does it prevent me from seeing? (This is important)"
|
||||||
|
"You will now be provided with the headline of the article then a paragraph from the article."
|
||||||
|
"The headline (title of the page) will be provided as \"Headline\": \"EXAMPLE HEADLINE\"."
|
||||||
|
"The paragraphs will be provided as \"Paragraph (numbered index)\": \"EXAMPLE PARAGRAPH\"."},
|
||||||
|
]
|
||||||
|
messages.extend(processed_graphs)
|
||||||
|
response = await send_chat(messages, "json")
|
||||||
|
print(response)
|
||||||
# TODO: parse `html`, summarise, etc.
|
# TODO: parse `html`, summarise, etc.
|
||||||
await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
|
await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.")
|
||||||
await send_text_file(message.channel, processed_html)
|
await send_text_file(message.channel, processed_html)
|
||||||
except:
|
await send_text_file(message.channel, response['response'])
|
||||||
LOGGER.exception("Playwright failed")
|
except Exception as exc:
|
||||||
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
|
await message.channel.send("❌ Sorry, I couldn't fetch that page.")
|
||||||
|
await message.channel.send(f"```\n{exc}\n```")
|
||||||
|
|
||||||
|
|
||||||
def extract_first_url(text: str) -> Optional[str]:
|
def extract_first_url(text: str) -> Optional[str]:
|
||||||
|
|
Binary file not shown.
25
news/pool.py
25
news/pool.py
|
@ -61,7 +61,7 @@ class PlaywrightPool:
|
||||||
await cls._pw.stop()
|
await cls._pw.stop()
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
async def fetch_html(cls, url: str) -> str:
|
async def fetch_html(cls, url: str) -> tuple[str, str]:
|
||||||
if cls._browser is None:
|
if cls._browser is None:
|
||||||
await cls.start()
|
await cls.start()
|
||||||
|
|
||||||
|
@ -70,7 +70,8 @@ class PlaywrightPool:
|
||||||
try:
|
try:
|
||||||
await page.goto(url, wait_until="load", timeout=60_000)
|
await page.goto(url, wait_until="load", timeout=60_000)
|
||||||
html = await page.content()
|
html = await page.content()
|
||||||
return html
|
title = await page.title()
|
||||||
|
return title, html
|
||||||
finally:
|
finally:
|
||||||
await page.close()
|
await page.close()
|
||||||
|
|
||||||
|
@ -139,7 +140,7 @@ class ArticleRepository:
|
||||||
# ------------------------------------------------------------------ #
|
# ------------------------------------------------------------------ #
|
||||||
# public API
|
# public API
|
||||||
# ------------------------------------------------------------------ #
|
# ------------------------------------------------------------------ #
|
||||||
async def get_article(self, url: str) -> str:
|
async def get_article(self, url: str) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
Main entry point.
|
Main entry point.
|
||||||
• Returns the processed text if it is already cached.
|
• Returns the processed text if it is already cached.
|
||||||
|
@ -150,29 +151,30 @@ class ArticleRepository:
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
row = self._row_for_url(url)
|
row = self._row_for_url(url)
|
||||||
|
|
||||||
if row and row[3]: # row = (id, url, raw, processed)
|
if row: # row = (id, url, title, raw, processed)
|
||||||
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
|
||||||
return row[3] # processed_html already present
|
return row[2], row[4] # processed_html already present
|
||||||
|
|
||||||
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
|
||||||
raw_html = await PlaywrightPool.fetch_html(url)
|
title, raw_html = await PlaywrightPool.fetch_html(url)
|
||||||
processed_html = process_html(raw_html)
|
processed_html = process_html(raw_html)
|
||||||
|
|
||||||
async with self._lock:
|
async with self._lock:
|
||||||
# Upsert:
|
# Upsert:
|
||||||
self._conn.execute(
|
self._conn.execute(
|
||||||
f"""
|
f"""
|
||||||
INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html)
|
INSERT INTO {self._TABLE_NAME} (url, title, raw_html, processed_html)
|
||||||
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
||||||
ON CONFLICT(url) DO UPDATE SET
|
ON CONFLICT(url) DO UPDATE SET
|
||||||
|
title=EXCLUDED.title,
|
||||||
raw_html=EXCLUDED.raw_html,
|
raw_html=EXCLUDED.raw_html,
|
||||||
processed_html=EXCLUDED.processed_html
|
processed_html=EXCLUDED.processed_html
|
||||||
""",
|
""",
|
||||||
(url, raw_html, processed_html),
|
(url, title, raw_html, processed_html),
|
||||||
)
|
)
|
||||||
self._conn.commit()
|
self._conn.commit()
|
||||||
|
|
||||||
return processed_html
|
return title, processed_html
|
||||||
|
|
||||||
def close(self) -> None:
|
def close(self) -> None:
|
||||||
"""Close the underlying DB connection."""
|
"""Close the underlying DB connection."""
|
||||||
|
@ -193,6 +195,7 @@ class ArticleRepository:
|
||||||
CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
|
CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} (
|
||||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||||
url TEXT UNIQUE NOT NULL,
|
url TEXT UNIQUE NOT NULL,
|
||||||
|
title TEXT NOT NULL,
|
||||||
raw_html TEXT NOT NULL,
|
raw_html TEXT NOT NULL,
|
||||||
processed_html TEXT NOT NULL
|
processed_html TEXT NOT NULL
|
||||||
)
|
)
|
||||||
|
@ -202,7 +205,7 @@ class ArticleRepository:
|
||||||
|
|
||||||
def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
|
def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]:
|
||||||
cur = self._conn.cursor()
|
cur = self._conn.cursor()
|
||||||
cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
|
cur.execute(f"SELECT id, url, title, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,))
|
||||||
return cur.fetchone()
|
return cur.fetchone()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
|
|
Loading…
Reference in New Issue