diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc
new file mode 100644
index 0000000..98fda27
Binary files /dev/null and b/news/__pycache__/pool.cpython-311.pyc differ
diff --git a/news/main.py b/news/main.py
index e69de29..dfbecb3 100644
--- a/news/main.py
+++ b/news/main.py
@@ -0,0 +1,171 @@
+from __future__ import annotations
+
+import asyncio
+import logging
+import os
+from typing import Final, Optional, List
+
+import discord
+from dotenv import load_dotenv
+import re
+from pool import PlaywrightPool
+import trafilatura
+import io
+
+from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+
+load_dotenv()
+
+# None when the variable is unset; main() refuses to start in that case.
+DISCORD_TOKEN: Final[Optional[str]] = os.getenv("DISCORD_TOKEN")
+
+ROLE_NAME = "Newsulizer"
+
+intents = discord.Intents.default()
+intents.message_content = True
+
+bot = discord.Client(intents=intents)
+
+LOGGER = logging.getLogger("Newsulizer")
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
+)
+
+# Strong references to in-flight article tasks. The event loop only keeps weak
+# references to tasks, so an unreferenced task may be garbage-collected mid-run.
+_BACKGROUND_TASKS: set[asyncio.Task[None]] = set()
+
+
+async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
+    """Send *content* to *channel* as a UTF-8 encoded file attachment."""
+    fp = io.BytesIO(content.encode("utf-8"))
+    file = discord.File(fp, filename=filename)
+    await channel.send("📄 Full article attached:", file=file)
+
+
+async def handle_article_url(message: discord.Message, url: str) -> None:
+    """Download *url*, extract the article as Markdown and attach it in reply.
+
+    Fetch failures and empty extractions are reported to the channel
+    separately so the user can tell which step went wrong.
+    """
+    LOGGER.info("Received URL from %s: %s", message.author, url)
+
+    try:
+        html = await PlaywrightPool.fetch_html(url)
+    except Exception:
+        # Top-level boundary of a background task: log the traceback and
+        # tell the user instead of letting the task die silently.
+        LOGGER.exception("Playwright failed")
+        await message.channel.send("❌ Sorry, I couldn't fetch that page.")
+        return
+
+    # TODO: parse `html`, summarise, etc.
+    await message.channel.send(f"✅ Article downloaded – {len(html):,} bytes.")
+
+    try:
+        article = trafilatura.extract(
+            html,
+            output_format="markdown",
+            include_images=True,
+            include_formatting=True,
+            include_comments=False,
+            favor_recall=True,
+        )
+        # trafilatura.extract() returns None when it finds no usable content.
+        if not article:
+            await message.channel.send("❌ Sorry, I couldn't extract an article from that page.")
+            return
+        await send_text_file(message.channel, article)
+    except Exception:
+        LOGGER.exception("Article extraction or upload failed")
+        await message.channel.send("❌ Sorry, I couldn't process that article.")
+
+
+def extract_first_url(text: str) -> Optional[str]:
+    """Return the first http(s)://… substring found in *text*, or None.
+
+    Angle brackets end the match so that links wrapped as ``<https://…>``
+    (Discord's embed-suppression syntax) do not keep the trailing ``>``.
+    """
+    match = re.search(r"https?://[^\s<>]+", text)
+    return match.group(0) if match else None
+
+
+@bot.event
+async def on_ready() -> None:
+    """Log the login and warm up the shared Playwright browser."""
+    LOGGER.info("Logged in as %s (id=%s)", bot.user, bot.user.id)
+    await PlaywrightPool.start()
+    LOGGER.info("Playwright pool ready")
+    LOGGER.info("------")
+
+
+@bot.event
+async def on_message(message: discord.Message) -> None:
+    """Queue an article download for DMs, mentions and role-enabled channels."""
+    # Ignore our own messages
+    if message.author == bot.user:
+        return
+
+    is_dm = message.guild is None
+
+    overwrite = False
+    mentioned = False
+    if not is_dm:
+        mentioned = bot.user in message.mentions
+        role = discord.utils.get(message.guild.roles, name=ROLE_NAME)
+        if role is None:
+            # The role doesn't even exist in this server. Only say so when the
+            # bot was addressed; warning on every message would flood the guild.
+            if mentioned:
+                await message.channel.send(f"Warning! Role **{ROLE_NAME}** not found in this server.")
+            return
+
+        # Some channel types (e.g. threads) may not expose `overwrites`.
+        overwrite = role in getattr(message.channel, "overwrites", {})
+
+    # Either a DM, a message that mentions the bot, or a role-enabled channel
+    if not (is_dm or mentioned or overwrite):
+        return
+
+    url = extract_first_url(message.content)
+    if not url:
+        await message.channel.send("Please send me a link to a news article.")
+        return
+
+    await message.channel.send(f"🔗 Thanks, <@{message.author.id}>! I’ve queued that article for analysis.")
+
+    # Launch the processing task without blocking Discord’s event loop
+    task = asyncio.create_task(handle_article_url(message, url))
+    _BACKGROUND_TASKS.add(task)
+    task.add_done_callback(_BACKGROUND_TASKS.discard)
+
+
+async def _run_bot(token: str) -> None:
+    """Run the bot and close Playwright on the same event loop it was started on."""
+    try:
+        async with bot:
+            await bot.start(token)
+    finally:
+        await PlaywrightPool.stop()
+
+
+def main() -> None:
+    """Validate configuration and run the bot until it is interrupted."""
+    if DISCORD_TOKEN is None:
+        raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")
+
+    try:
+        # Playwright's async objects belong to the loop they were created on, so
+        # the pool has to be stopped inside this loop rather than in a second
+        # asyncio.run() after the bot's loop has already been closed.
+        asyncio.run(_run_bot(DISCORD_TOKEN))
+    except KeyboardInterrupt:
+        # Ctrl+C is the normal way to stop the bot; cleanup ran in _run_bot().
+        pass
+
+
+if __name__ == "__main__":
+    main()
diff --git a/news/pool.py b/news/pool.py
new file mode 100644
index 0000000..9a0b847
--- /dev/null
+++ b/news/pool.py
@@ -0,0 +1,108 @@
+from playwright.async_api import async_playwright, Browser, BrowserContext, Page
+from typing import Final, Optional
+import asyncio
+
+
+class PlaywrightPool:
+    """Process-wide headless Chromium shared by all callers.
+
+    All state lives on the class; use the classmethods directly instead of
+    instantiating. start() is safe to call repeatedly and fetch_html() starts
+    the pool on demand.
+    """
+
+    _USER_AGENT: Final[str] = (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"
+    )
+    _VIEWPORT: Final = {"width": 1280, "height": 800}
+
+    _pw = None  # playwright instance
+    _browser: Optional[Browser] = None
+    _ctx: Optional[BrowserContext] = None
+    _sema: Optional[asyncio.Semaphore] = None  # limit parallel pages
+    _start_lock: Optional[asyncio.Lock] = None  # serialises start()
+
+    @classmethod
+    async def _open_context(cls) -> BrowserContext:
+        """Create a browser context with the pool's user agent and viewport."""
+        return await cls._browser.new_context(
+            user_agent=cls._USER_AGENT,
+            viewport=dict(cls._VIEWPORT),
+        )
+
+    @classmethod
+    async def start(cls, max_concurrency: int = 4) -> None:
+        """Launch the browser and default context; no-op when already running.
+
+        *max_concurrency* caps how many pages fetch_html() keeps open at once.
+        """
+        if cls._start_lock is None:
+            cls._start_lock = asyncio.Lock()
+
+        # Hold the lock for the whole launch so a concurrent caller waits for
+        # a fully initialised pool instead of returning early on a half-built one.
+        async with cls._start_lock:
+            if cls._ctx is not None:
+                return
+
+            if cls._pw is None:
+                cls._pw = await async_playwright().start()
+            if cls._browser is None:
+                cls._browser = await cls._pw.chromium.launch(
+                    headless=True,
+                    args=["--disable-blink-features=AutomationControlled"],
+                )
+            # Publish the context last: a non-None _ctx means the pool is ready.
+            cls._sema = asyncio.Semaphore(max_concurrency)
+            cls._ctx = await cls._open_context()
+
+    @classmethod
+    async def new_context(cls) -> None:
+        """Replace the shared context with a fresh one."""
+        if cls._browser is None:
+            # start() already opens a fresh context, so there is nothing to swap.
+            await cls.start()
+            return
+        if cls._ctx:
+            await cls._ctx.close()
+        cls._ctx = await cls._open_context()
+
+    @classmethod
+    async def stop(cls) -> None:
+        """Close the context, browser and Playwright driver, then reset the pool."""
+        # Detach the handles first so the pool reads as stopped (and start()
+        # can relaunch it) even if one of the close calls below raises.
+        ctx, browser, pw = cls._ctx, cls._browser, cls._pw
+        cls._ctx = cls._browser = cls._pw = None
+        cls._sema = None
+        cls._start_lock = None
+
+        try:
+            if ctx:
+                await ctx.close()
+        finally:
+            try:
+                if browser:
+                    await browser.close()
+            finally:
+                if pw:
+                    await pw.stop()
+
+    @classmethod
+    async def fetch_html(cls, url: str) -> str:
+        """Load *url* in a new page and return the rendered document's HTML.
+
+        Starts the pool on demand. Propagates whatever Playwright raises on
+        navigation errors or when the 60 s load timeout expires.
+        """
+        if cls._ctx is None:
+            await cls.start()
+
+        async with cls._sema:  # throttle concurrency
+            page: Page = await cls._ctx.new_page()
+            try:
+                await page.goto(url, wait_until="load", timeout=60_000)
+                return await page.content()
+            finally:
+                await page.close()
diff --git a/news/test.py b/news/test.py
new file mode 100644
index 0000000..190491c
--- /dev/null
+++ b/news/test.py
@@ -0,0 +1,21 @@
+from pool import PlaywrightPool
+import asyncio
+import trafilatura
+
+article = "https://financialpost.com/news/economy/ontario-and-quebec-economies-to-be-hardest-hit-this-year-by-trade-war-deloitte"
+
+
+async def main() -> None:
+    """Fetch the sample article and print trafilatura's extraction of it."""
+    await PlaywrightPool.start()
+    try:
+        html = await PlaywrightPool.fetch_html(article)
+        print(trafilatura.extract(html))
+    finally:
+        # Always shut the browser down, even when the fetch fails.
+        await PlaywrightPool.stop()
+
+
+if __name__ == "__main__":
+    # Guarded so that importing this module does not launch a browser.
+    asyncio.run(main())