news
parent
8838df750b
commit
6425845091
Binary file not shown.
116
news/main.py
116
news/main.py
|
@ -0,0 +1,116 @@
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Final, Optional, List
|
||||||
|
|
||||||
|
import discord
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
import re
|
||||||
|
from pool import PlaywrightPool
|
||||||
|
import trafilatura
|
||||||
|
import io
|
||||||
|
|
||||||
|
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
||||||
|
|
||||||
|
# Populate the process environment from a .env file before the token is read.
load_dotenv()

# os.getenv() returns None when the variable is unset, so the constant is
# Optional; main() checks for None before starting the bot.
DISCORD_TOKEN: Final[Optional[str]] = os.getenv("DISCORD_TOKEN")

# Name of the guild role that on_message() looks up and checks against the
# channel's permission overwrites.
ROLE_NAME = "Newsulizer"

# Default intents plus message content, which on_message() reads to find URLs.
intents = discord.Intents.default()
intents.message_content = True

bot = discord.Client(intents=intents)

LOGGER = logging.getLogger("Newsulizer")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
|
||||||
|
|
||||||
|
async def send_text_file(channel: discord.abc.Messageable, content: str, filename: str = "article.md") -> None:
    """Send *content* to *channel* as a UTF-8 encoded file attachment.

    Args:
        channel: Destination that the attachment is sent to.
        content: Text to upload; it is encoded as UTF-8 in memory.
        filename: Name given to the uploaded attachment.
    """
    # Build the attachment from an in-memory buffer; nothing is written to disk.
    attachment = discord.File(io.BytesIO(content.encode("utf-8")), filename=filename)
    await channel.send("📄 Full article attached:", file=attachment)
|
||||||
|
|
||||||
|
|
||||||
|
async def handle_article_url(message: discord.Message, url: str) -> None:
    """Download the page at *url*, extract the article and post it back.

    The page is fetched through ``PlaywrightPool``, converted to Markdown
    with ``trafilatura`` and sent to the channel of *message* as a file
    attachment. Failures are logged and reported to the channel instead of
    propagating, because this coroutine runs as a background task.

    Args:
        message: The Discord message that contained the link; replies go to
            its channel.
        url: The article URL to download.
    """
    LOGGER.info("Received URL from %s: %s", message.author, url)

    # Stage 1: download. Kept in its own ``try`` so the "Playwright failed"
    # log line is only emitted for genuine fetch errors.
    try:
        html = await PlaywrightPool.fetch_html(url)
    except Exception:  # not a bare ``except``: cancellation must propagate
        LOGGER.exception("Playwright failed")
        await message.channel.send("❌ Sorry, I couldn't fetch that page.")
        return

    # Stage 2: extract and deliver.
    try:
        # TODO: summarise the extracted article, etc.
        await message.channel.send(f"✅ Article downloaded – {len(html):,} bytes.")

        article_md = trafilatura.extract(
            html,
            output_format='markdown',
            include_images=True,
            include_formatting=True,
            include_comments=False,
            favor_recall=True,
        )
        # extract() yields None when it finds no usable text; encoding None
        # would raise, so report the situation explicitly instead.
        if not article_md:
            LOGGER.warning("No article text could be extracted from %s", url)
            await message.channel.send(
                "⚠️ I downloaded the page but couldn't find any article text in it."
            )
            return

        await send_text_file(message.channel, article_md)
    except Exception:
        LOGGER.exception("Failed to extract or deliver the article from %s", url)
        await message.channel.send("❌ Sorry, something went wrong while processing that article.")
|
||||||
|
|
||||||
|
|
||||||
|
def extract_first_url(text: str) -> Optional[str]:
    """Return the first http(s) URL found in *text*, or None.

    The match stops at whitespace and at ``<`` / ``>``, so Discord's
    ``<https://…>`` embed-suppressing form yields the bare URL. Trailing
    sentence punctuation and an unmatched closing parenthesis are removed,
    so ``"see https://example.com/x."`` and ``"(https://example.com/x)"``
    both give ``https://example.com/x``. Balanced parentheses inside the
    URL are kept.

    Args:
        text: Arbitrary message text.

    Returns:
        The cleaned URL, or None when *text* contains no http(s) link.
    """
    match = re.search(r"https?://[^\s<>]+", text)
    if match is None:
        return None

    url = match.group(0)
    while url:
        last = url[-1]
        if last in ".,;:!?\"'":
            # Punctuation that belongs to the surrounding sentence.
            url = url[:-1]
        elif last == ")" and url.count(")") > url.count("("):
            # A ")" with no matching "(" closes a parenthesis in the prose.
            url = url[:-1]
        else:
            break
    return url
|
||||||
|
|
||||||
|
@bot.event
async def on_ready() -> None:
    """Log the bot's identity and start the Playwright pool."""
    me = bot.user
    LOGGER.info("Logged in as %s (id=%s)", me, me.id)

    await PlaywrightPool.start()

    for line in ("Playwright pool ready", "------"):
        LOGGER.info(line)
|
||||||
|
|
||||||
|
# Strong references to in-flight article tasks. The event loop only keeps weak
# references to tasks, so a task nobody else holds could be garbage-collected
# before it finishes.
_BACKGROUND_TASKS: set = set()


@bot.event
async def on_message(message: discord.Message) -> None:
    """React to messages addressed to the bot and queue the linked article.

    A message is handled when it is a DM, when it mentions the bot in a
    guild, or when it is posted in a guild channel whose permission
    overwrites include the ``ROLE_NAME`` role. If the message contains no
    URL the sender is asked for one; otherwise the article is processed in
    a background task.

    Args:
        message: The incoming Discord message.
    """
    # Ignore our own messages
    if message.author == bot.user:
        return

    is_dm = message.guild is None
    addressed = is_dm

    if not is_dm:
        mentioned = bot.user in message.mentions
        role = discord.utils.get(message.guild.roles, name=ROLE_NAME)
        if role is None:
            # The role doesn't even exist in this server. Only warn when the
            # bot was actually mentioned; otherwise every single message in
            # the server would trigger this warning.
            if mentioned:
                await message.channel.send(f"Warning! Role **{ROLE_NAME}** not found in this server.")
            return

        # NOTE(review): some channel types (threads, presumably) expose no
        # ``overwrites`` mapping; treat those as having no overwrite — confirm.
        channel_overwrites = getattr(message.channel, "overwrites", {})
        addressed = mentioned or role in channel_overwrites

    # Neither a DM, a mention, nor a channel dedicated to the role.
    if not addressed:
        return

    url = extract_first_url(message.content)
    if not url:
        await message.channel.send("Please send me a link to a news article.")
        return

    await message.channel.send(f"🔗 Thanks, <@{message.author.id}>! I’ve queued that article for analysis.")

    # Launch the processing task without blocking Discord’s event loop, and
    # keep a reference to it until it completes.
    task = asyncio.create_task(handle_article_url(message, url))
    _BACKGROUND_TASKS.add(task)
    task.add_done_callback(_BACKGROUND_TASKS.discard)
|
||||||
|
|
||||||
|
async def _run_bot(token: str) -> None:
    """Run the bot until it disconnects, then shut the Playwright pool down.

    Everything happens inside one event loop: the pool is started from
    ``on_ready`` on this loop, so it must be stopped on the same loop
    rather than on a fresh one created after the bot has exited.
    """
    try:
        async with bot:
            await bot.start(token)
    finally:
        await PlaywrightPool.stop()


def main() -> None:
    """Validate the configuration and run the bot until it is stopped.

    Raises:
        RuntimeError: If no Discord token is configured.
    """
    if DISCORD_TOKEN is None:
        raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")

    try:
        asyncio.run(_run_bot(DISCORD_TOKEN))
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the bot; cleanup already ran in
        # _run_bot()'s ``finally`` block.
        pass


if __name__ == "__main__":
    main()
|
|
@ -0,0 +1,57 @@
|
||||||
|
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
||||||
|
from typing import Final, Optional
|
||||||
|
import asyncio
|
||||||
|
|
||||||
|
class PlaywrightPool:
    """Process-wide holder of one headless Chromium browser and context.

    All state lives on the class and every method is a classmethod, so the
    pool is used without instantiation: ``await PlaywrightPool.start()``,
    ``await PlaywrightPool.fetch_html(url)``, ``await PlaywrightPool.stop()``.
    The number of simultaneously open pages is capped by a semaphore.
    """

    # Options shared by every browser context the pool creates.
    _USER_AGENT: Final[str] = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36"
    )
    _VIEWPORT: Final[dict] = {"width": 1280, "height": 800}

    _pw = None  # playwright instance
    _browser: Optional[Browser] = None
    _ctx: Optional[BrowserContext] = None
    _sema: Optional[asyncio.Semaphore] = None  # limit parallel pages
    _start_lock: Optional[asyncio.Lock] = None  # serialises concurrent start() calls

    @classmethod
    async def _open_context(cls, browser: Browser) -> BrowserContext:
        """Create a browser context with the pool's user agent and viewport."""
        return await browser.new_context(
            user_agent=cls._USER_AGENT,
            viewport=dict(cls._VIEWPORT),
        )

    @classmethod
    async def start(cls, max_concurrency: int = 4) -> None:
        """Launch Playwright, the browser and a context if not already running.

        Safe to call repeatedly and concurrently: later callers wait for the
        first launch to finish and then return without doing anything.

        Args:
            max_concurrency: Maximum number of pages open at the same time.
        """
        # Created lazily, inside a running loop. There is no await between
        # the check and the assignment, so two callers cannot both create one.
        if cls._start_lock is None:
            cls._start_lock = asyncio.Lock()

        async with cls._start_lock:
            if cls._pw is not None:
                return

            # Built first so an invalid value fails before a browser is launched.
            sema = asyncio.Semaphore(max_concurrency)

            pw = await async_playwright().start()
            try:
                browser = await pw.chromium.launch(
                    headless=True,
                    args=["--disable-blink-features=AutomationControlled"],
                )
                ctx = await cls._open_context(browser)
            except BaseException:
                # Don't leave a half-started pool (or a stray driver process)
                # behind; BaseException so cancellation is cleaned up as well.
                await pw.stop()
                raise

            # Publish everything at once, only after it is fully usable.
            cls._pw, cls._browser, cls._ctx, cls._sema = pw, browser, ctx, sema

    @classmethod
    async def new_context(cls) -> None:
        """Replace the current browser context with a fresh one.

        If the pool has not been started yet it is started instead, which
        already creates a fresh context.
        """
        if cls._browser is None:
            await cls.start()
            return

        # Swap first and close afterwards, so ``_ctx`` never refers to a
        # context that has already been closed.
        previous = cls._ctx
        cls._ctx = await cls._open_context(cls._browser)
        if previous:
            await previous.close()

    @classmethod
    async def stop(cls) -> None:
        """Close the context, browser and Playwright, and reset the pool.

        After this call the pool is back in its initial state, so ``start()``
        can launch a new browser. Calling it on a stopped pool is a no-op.
        """
        ctx, browser, pw = cls._ctx, cls._browser, cls._pw
        # Reset before closing: even if a close call fails, a later start()
        # must not mistake the dead objects for a running pool.
        cls._ctx = cls._browser = cls._pw = None
        cls._sema = None
        cls._start_lock = None

        try:
            if ctx:
                await ctx.close()
        finally:
            try:
                if browser:
                    await browser.close()
            finally:
                if pw:
                    await pw.stop()

    @classmethod
    async def fetch_html(cls, url: str) -> str:
        """Open *url* in a new page and return its HTML once loaded.

        Starts the pool on first use. Navigation waits for the ``load``
        event with a 60 second timeout, and the page is always closed.

        Args:
            url: Address of the page to download.

        Returns:
            The page's HTML as reported by ``page.content()``.
        """
        if cls._ctx is None or cls._sema is None:
            await cls.start()

        async with cls._sema:  # throttle concurrency
            page: Page = await cls._ctx.new_page()
            try:
                await page.goto(url, wait_until="load", timeout=60_000)
                return await page.content()
            finally:
                await page.close()
|
|
@ -0,0 +1,16 @@
|
||||||
|
from pool import PlaywrightPool
|
||||||
|
import asyncio
|
||||||
|
import trafilatura
|
||||||
|
|
||||||
|
# Sample article used to smoke-test the fetch + extract pipeline.
article = "https://financialpost.com/news/economy/ontario-and-quebec-economies-to-be-hardest-hit-this-year-by-trade-war-deloitte"


async def main() -> None:
    """Fetch the sample article and print the text trafilatura extracts."""
    try:
        await PlaywrightPool.start()
        html = await PlaywrightPool.fetch_html(article)
        print(trafilatura.extract(html))
    finally:
        # Always shut the browser down, even when the fetch or the
        # extraction raises.
        await PlaywrightPool.stop()


asyncio.run(main())
|
Loading…
Reference in New Issue