From 301483810e1fc65eae7ad3ac2ef90d61defdc83d Mon Sep 17 00:00:00 2001
From: Brett Laptop
Date: Thu, 3 Jul 2025 21:33:49 -0400
Subject: [PATCH] screw it i think im done with this.

---
 news/__pycache__/pool.cpython-311.pyc   | Bin 18481 -> 21031 bytes
 news/__pycache__/server.cpython-311.pyc | Bin 0 -> 1916 bytes
 news/main.py                            |  2 +-
 news/pool.py                            | 86 +++++++++++++++---------
 news/server.py                          | 15 ++---
 5 files changed, 62 insertions(+), 41 deletions(-)
 create mode 100644 news/__pycache__/server.cpython-311.pyc

diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc
index 02e1dfc1365247bc01842472b77b71145db58bbe..e6ee685346a918b04dc420da314bb245f34e45f4 100644
Binary files a/news/__pycache__/pool.cpython-311.pyc and b/news/__pycache__/pool.cpython-311.pyc differ
diff --git a/news/__pycache__/server.cpython-311.pyc b/news/__pycache__/server.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b9411f845eec4faa80fa230bf63c35a3380b058
Binary files /dev/null and b/news/__pycache__/server.cpython-311.pyc differ
diff --git a/news/main.py b/news/main.py
index 0840374..c89ef45 100644
--- a/news/main.py
+++ b/news/main.py
@@ -196,7 +196,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
     LOGGER.info("Received URL from %s: %s", message.author, url)
 
     try:
-        title, processed_html = await server.article_repository.get_article(url)
+        title, processed_html = await server.article_repository.get_article_async(url)
 
         if await server.article_repository.has_paragraphs(url):
             await message.channel.send("This article has already been processed.")
diff --git a/news/pool.py b/news/pool.py
index 78cdc62..be99a49 100644
--- a/news/pool.py
+++ b/news/pool.py
@@ -15,10 +15,6 @@ def process_html(html):
         include_tables=True, include_comments=False, favor_recall=True)
 
 LOGGER = logging.getLogger("pool")
-# logging.basicConfig(
-#     level=logging.INFO,
-#     format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
-# )
 
 class PlaywrightPool:
     _pw = None  # playwright instance
@@ -140,7 +136,46 @@ class ArticleRepository:
     # ------------------------------------------------------------------ #
     # public API                                                         #
     # ------------------------------------------------------------------ #
-    async def get_article(self, url: str) -> tuple[str, str]:
+
+    async def get_article_async(self, url: str) -> tuple[str, str]:
+        async with self._lock:
+            result = self._get_article(url)
+            if result:
+                return result
+
+            LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
+            title, raw_html = await PlaywrightPool.fetch_html(url)
+            processed_html = process_html(raw_html)
+
+            # Upsert:
+            self._conn.execute(
+                f"""
+                INSERT INTO articles (url, title, raw_html, processed_html)
+                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
+                ON CONFLICT(url) DO UPDATE SET
+                    title=EXCLUDED.title,
+                    raw_html=EXCLUDED.raw_html,
+                    processed_html=EXCLUDED.processed_html
+                """,
+                (url, title, raw_html, processed_html),
+            )
+            self._conn.commit()
+
+            return title, processed_html
+
+    def get_article(self, url: str) -> tuple[str, str] | None:
+        try:
+            self._lock.acquire()
+            return self._get_article(url)
+        except Exception as exc:
+            LOGGER.exception(f"[ArticleRepository] Error while getting article for {url}")
+            LOGGER.exception(exc)
+            return None
+        finally:
+            if self._lock.locked():
+                self._lock.release()
+
+    def _get_article(self, url: str) -> tuple[str, str] | None:
         """
         Main entry point.
          • Returns the processed text if it is already cached.
@@ -148,33 +183,14 @@ class ArticleRepository:
         """
 
         # Single writer at a time when using sqlite3 – avoids `database is locked`
-        async with self._lock:
-            row = self._row_for_url(url)
+        row = self._row_for_url(url)
 
-            if row:  # row = (id, url, title, raw, processed)
-                LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
-                return row[2], row[4]  # processed_html already present
+        if row:  # row = (id, url, title, raw, processed)
+            LOGGER.info(f"[ArticleRepository] Found cached article for {url}")
+            return row[2], row[4]  # processed_html already present
 
-        LOGGER.info(f"[ArticleRepository] Downloading article for {url}")
-        title, raw_html = await PlaywrightPool.fetch_html(url)
-        processed_html = process_html(raw_html)
-
-        async with self._lock:
-            # Upsert:
-            self._conn.execute(
-                f"""
-                INSERT INTO articles (url, title, raw_html, processed_html)
-                VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type}, {self.cursor_type})
-                ON CONFLICT(url) DO UPDATE SET
-                    title=EXCLUDED.title,
-                    raw_html=EXCLUDED.raw_html,
-                    processed_html=EXCLUDED.processed_html
-                """,
-                (url, title, raw_html, processed_html),
-            )
-            self._conn.commit()
-
-        return title, processed_html
+        LOGGER.info(f"[ArticleRepository] Article was not found for {url}")
+        return None
 
     async def has_paragraphs(self, url) -> bool:
         async with self._lock:
@@ -190,6 +206,16 @@ class ArticleRepository:
             return False
         return True
 
+    def get_latest_articles(self, count):
+        try:
+            self._lock.acquire()
+            cur = self._conn.cursor()
+            row = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,))
+
+            return row.fetchall()
+        finally:
+            self._lock.release()
+
     async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
         async with self._lock:
             article_id = self._row_for_url(url)[0]
diff --git a/news/server.py b/news/server.py
index adb2f1c..6afb151 100644
--- a/news/server.py
+++ b/news/server.py
@@ -1,5 +1,6 @@
 from flask import Flask, request, jsonify, abort
 from pathlib import Path
+import logging
 
 # Import the repository class from the existing code base.
 # Adjust the relative import path if pool.py lives in a package.
@@ -9,12 +10,10 @@
 app = Flask(__name__)
 
 article_repository = ArticleRepository()
+LOGGER = logging.getLogger("server")
+
 @app.route("/articles/<path:article_url>", methods=["GET"])
 def get_article(article_url: str):
-    """
-    Fetch one article by its numeric primary key.
-    Responds with the whole row in JSON or 404 if not present.
-    """
     article = article_repository.get_article(article_url)
     if article is None:
         abort(404, description="Article not found")
@@ -22,16 +21,12 @@ def get_article(article_url: str):
 
 @app.route("/article-by-url", methods=["GET"])
 def get_article_by_url():
-    """
-    Same as above but lets a client specify the canonical URL instead of the ID:
-
-        GET /article-by-url?url=https://example.com/foo
-    """
     url = request.args.get("url")
     if not url:
         abort(400, description="`url` query parameter is required")
 
-    article = await article_repository.get_article(url)
+    LOGGER.info(f"Fetching article by URL: {url}")
+    article = article_repository.get_article(url)
     if article is None:
         abort(404, description="Article not found")
     return jsonify(article)
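
Note on the new synchronous get_article(): the `async with self._lock` blocks elsewhere in pool.py imply that self._lock is an asyncio.Lock, and asyncio.Lock.acquire() called without `await` returns an un-awaited coroutine, so the sync path added in this patch never actually holds the lock. Below is a minimal sketch of a lock that the synchronous Flask handlers in server.py could take safely. The separate threading.Lock and the `_sync_lock` name are assumptions for illustration, not part of this patch:

    import logging
    import threading

    LOGGER = logging.getLogger("pool")

    class ArticleRepository:
        def __init__(self):
            # ... existing connection setup elided ...
            # Hypothetical second, thread-level lock for synchronous readers;
            # the patch itself reuses the asyncio.Lock here instead.
            self._sync_lock = threading.Lock()

        def _get_article(self, url: str) -> tuple[str, str] | None:
            # Stand-in for the cache lookup defined in pool.py.
            return None

        def get_article(self, url: str) -> tuple[str, str] | None:
            # `with` releases the lock even when _get_article raises, so the
            # manual locked()/release() bookkeeping in the patch's finally
            # block becomes unnecessary.
            with self._sync_lock:
                try:
                    return self._get_article(url)
                except Exception:
                    LOGGER.exception("[ArticleRepository] Error while getting article for %s", url)
                    return None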