57 lines
2.0 KiB
Python
57 lines
2.0 KiB
Python
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
|
|
from typing import Final, Optional
|
|
import asyncio
|
|
|
|
class PlaywrightPool:
|
|
_pw = None # playwright instance
|
|
_browser: Optional[Browser] = None
|
|
_ctx: Optional[BrowserContext] = None
|
|
_sema: asyncio.Semaphore # limit parallel pages
|
|
|
|
@classmethod
|
|
async def start(cls, max_concurrency: int = 4) -> None:
|
|
if cls._pw is not None:
|
|
return
|
|
|
|
cls._pw = await async_playwright().start()
|
|
cls._browser = await cls._pw.chromium.launch(
|
|
headless=True,
|
|
args=["--disable-blink-features=AutomationControlled"],
|
|
)
|
|
cls._ctx = await cls._browser.new_context(
|
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
|
|
viewport={"width": 1280, "height": 800},
|
|
)
|
|
cls._sema = asyncio.Semaphore(max_concurrency)
|
|
|
|
@classmethod
|
|
async def new_context(cls) -> None:
|
|
if cls._ctx:
|
|
await cls._ctx.close()
|
|
cls._ctx = await cls._browser.new_context(
|
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36",
|
|
viewport={"width": 1280, "height": 800},
|
|
)
|
|
|
|
@classmethod
|
|
async def stop(cls) -> None:
|
|
if cls._ctx:
|
|
await cls._ctx.close()
|
|
if cls._browser:
|
|
await cls._browser.close()
|
|
if cls._pw:
|
|
await cls._pw.stop()
|
|
|
|
@classmethod
|
|
async def fetch_html(cls, url: str) -> str:
|
|
if cls._browser is None:
|
|
await cls.start()
|
|
|
|
async with cls._sema: # throttle concurrency
|
|
page: Page = await cls._ctx.new_page()
|
|
try:
|
|
await page.goto(url, wait_until="load", timeout=60_000)
|
|
html = await page.content()
|
|
return html
|
|
finally:
|
|
await page.close() |