diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc index 7da7b5e..267ca64 100644 Binary files a/news/__pycache__/pool.cpython-311.pyc and b/news/__pycache__/pool.cpython-311.pyc differ diff --git a/news/main.py b/news/main.py index 78f2907..159c58a 100644 --- a/news/main.py +++ b/news/main.py @@ -34,14 +34,14 @@ logging.basicConfig( article_repository = ArticleRepository() -async def send_chat(messages): +async def send_chat(messages, fmt): return await AsyncClient(host="192.168.69.3:11434").chat( model="deepseek-r1:8b", messages=messages, stream=False, options={ 'temperature': 0.5, - "num_ctx": 128000 + # "num_ctx": 128000 }, think=True) @@ -60,13 +60,39 @@ async def handle_article_url(message: discord.Message, url: str) -> None: LOGGER.info("Received URL from %s: %s", message.author, url) try: - processed_html = await article_repository.get_article(url) + title, processed_html = await article_repository.get_article(url) + paragraphs = processed_html.split("\n") + paragraphs = [f"\"Paragraph ({i + 1})\": {paragraph.strip()}" for i, paragraph in enumerate(paragraphs)] + processed_graphs = [{"role": "user", "content": paragraph} for paragraph in paragraphs] + # print(paragraphs) + # print(processed_graphs) + + messages = [ + {"role": "system", "content": "You are an expert article-analysis assistant." + # "You WILL respond in JSON format." + "Your job is to analyse paragraphs in the article and look for provocative, emotionally charged, and loaded language" + "You will analyse the paragraphs, determine if they are provocative, and if so, output a capital X for each problematic word." + "Questions you should ask yourself while reading the paragraph:" + "1. What is the literal meaning of the questionable word or phrase?" + "2. What is the emotional or social context of the questionable word or phrase?" + "3. Does that word or phrase have any connotations, that is, associations that are positive or negative?" + "4. What group (sometimes called a “discourse community”) favors one locution over another, and why?" + "5. Is the word or phrase “loaded”? How far does it steer us from neutral?" + "6. Does the word or phrase help me see, or does it prevent me from seeing? (This is important)" + "You will now be provided with the headline of the article then a paragraph from the article." + "The headline (title of the page) will be provided as \"Headline\": \"EXAMPLE HEADLINE\"." + "The paragraphs will be provided as \"Paragraph (numbered index)\": \"EXAMPLE PARAGRAPH\"."}, + ] + messages.extend(processed_graphs) + response = await send_chat(messages, "json") + print(response) # TODO: parse `html`, summarise, etc. await message.channel.send(f"✅ Article downloaded – {len(processed_html):,} bytes.") await send_text_file(message.channel, processed_html) - except: - LOGGER.exception("Playwright failed") + await send_text_file(message.channel, response['response']) + except Exception as exc: await message.channel.send("❌ Sorry, I couldn't fetch that page.") + await message.channel.send(f"```\n{exc}\n```") def extract_first_url(text: str) -> Optional[str]: diff --git a/news/newsulizer.sqlite3 b/news/newsulizer.sqlite3 deleted file mode 100644 index feb5b52..0000000 Binary files a/news/newsulizer.sqlite3 and /dev/null differ diff --git a/news/pool.py b/news/pool.py index a858e94..d56fac9 100644 --- a/news/pool.py +++ b/news/pool.py @@ -61,7 +61,7 @@ class PlaywrightPool: await cls._pw.stop() @classmethod - async def fetch_html(cls, url: str) -> str: + async def fetch_html(cls, url: str) -> tuple[str, str]: if cls._browser is None: await cls.start() @@ -70,7 +70,8 @@ class PlaywrightPool: try: await page.goto(url, wait_until="load", timeout=60_000) html = await page.content() - return html + title = await page.title() + return title, html finally: await page.close() @@ -139,7 +140,7 @@ class ArticleRepository: # ------------------------------------------------------------------ # # public API # ------------------------------------------------------------------ # - async def get_article(self, url: str) -> str: + async def get_article(self, url: str) -> tuple[str, str]: """ Main entry point. • Returns the processed text if it is already cached. @@ -150,29 +151,30 @@ class ArticleRepository: async with self._lock: row = self._row_for_url(url) - if row and row[3]: # row = (id, url, raw, processed) + if row: # row = (id, url, title, raw, processed) LOGGER.info(f"[ArticleRepository] Found cached article for {url}") - return row[3] # processed_html already present + return row[2], row[4] # processed_html already present LOGGER.info(f"[ArticleRepository] Downloading article for {url}") - raw_html = await PlaywrightPool.fetch_html(url) + title, raw_html = await PlaywrightPool.fetch_html(url) processed_html = process_html(raw_html) async with self._lock: # Upsert: self._conn.execute( f""" - INSERT INTO {self._TABLE_NAME} (url, raw_html, processed_html) - VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}) + INSERT INTO {self._TABLE_NAME} (url, title, raw_html, processed_html) + VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type}) ON CONFLICT(url) DO UPDATE SET + title=EXCLUDED.title, raw_html=EXCLUDED.raw_html, processed_html=EXCLUDED.processed_html """, - (url, raw_html, processed_html), + (url, title, raw_html, processed_html), ) self._conn.commit() - return processed_html + return title, processed_html def close(self) -> None: """Close the underlying DB connection.""" @@ -193,6 +195,7 @@ class ArticleRepository: CREATE TABLE IF NOT EXISTS {self._TABLE_NAME} ( id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT UNIQUE NOT NULL, + title TEXT NOT NULL, raw_html TEXT NOT NULL, processed_html TEXT NOT NULL ) @@ -202,7 +205,7 @@ class ArticleRepository: def _row_for_url(self, url: str) -> Optional[Tuple[Any, ...]]: cur = self._conn.cursor() - cur.execute(f"SELECT id, url, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,)) + cur.execute(f"SELECT id, url, title, raw_html, processed_html FROM {self._TABLE_NAME} WHERE url = {self.cursor_type}", (url,)) return cur.fetchone() @staticmethod