From 53e22c84651d9ff57ed89c0515eacac32cc56aea Mon Sep 17 00:00:00 2001 From: Brett Laptop Date: Sun, 6 Jul 2025 22:12:15 -0400 Subject: [PATCH] comes together --- news/__pycache__/pool.cpython-311.pyc | Bin 21080 -> 25183 bytes news/__pycache__/server.cpython-311.pyc | Bin 2225 -> 5973 bytes news/pool.py | 77 +++++++- news/server.py | 67 ++++++- news/static/index.html | 128 +++++++++++++ news/static/view.html | 228 ++++++++++++++++++++++++ 6 files changed, 495 insertions(+), 5 deletions(-) create mode 100644 news/static/view.html diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc index 803a77ab98ac13c03964063cf9b7020b9f934935..acbcadf0d80145e24f1a10485540a6990f8bbab7 100644 GIT binary patch delta 6910 zcmbVRYit|Wm7W<6-xS5C-j=ABEt;}y$CBbW@xJ7}I0e3?%4I=DrfaXU5dE;GV4J@+f z+@T~=QQKfg!|$GZ?z!ild*|G9AIiVNe(*~+{c1&pfr00N=lt02-3;^Zm|1$%3c-H7 zon@FCjKBzNl!>uEmPMM2axs-pMQK%(kEwm?n8v4xX?@z5&Znb%KC1WWL6(mid0fahgj#c_9V-}w!X7yQPHlHnK_t|3(pQEVA>2nI2%Z#szV?xez z)k`)C+FM}U>+n+^Uk%6XV+7r0M$q5lpag!!l3H3~fD+?UNmsF`4wR|{)18WPWwe4B zDpV>J z{xh$VCMEek``gBP5UzxuxD{ZUdEZ!-GdA9oe$;c%kvSWle~-(mShf5E%{sda1PaD@ z=z>3(h)bbKi8RaaYihQ)0N>3ORKci7+JF|Z&n_3KgM{<|BprvMfr}SNWOPhAlt@Io zL@7X|dU=!fzcmd|9%SXiyhGlpdo$v}I-Llc5O9D98s_FnGb9CdFq#lU#D)Cr0JmB4 zI3yx|fMSctPS&kzvBYX@Li~!n$GBad)gNXX=KoFaWCu|@*@@sn=mse8qLdgXyMQd! z-w*kueU*Nc?3N!l-UfXi8l#-W1yZ#01cwJkDqwEwME!0xwU`V5UEs$9qajj@5?%li zUEYNcsgv)TH?lkDe{Z&MkF^3DMgRwwv?HuX*Z^>wE1`L+lFf1izBX_B9_h1tFG zA8amTiFvZ!e(vBN5WTG?r?HwJ;RM1-gi`=+4UBRq8eYX-kry=}MTZ2(`u;T15So?_-2}VPQ0wgd>0^?&M^_hYJ z<{a|<5h3Y}Mnvg^G%+3xoruJxPM0W=Q>O}s@#Q*6JrzNIK?BYm35rRx5D7{v^67#> zDe%VvG-N~=EFF&I%glU9GZFcu)7`-$fPrc3@7c>DCo^vH7~dA>bE-z?vr=bah) zW_f3xZ_Lm)%Qxovof-OW$?`k%yk*)u?S0Oh=i4&$&GKz|t9^P16{1Q+#f8E!O={Dx zLE9^?Ah14g1o(<7pIXo88Nn@V6pWYEK8@fJDlhXstr;iWRcL^m8Pau-Rtk>@O^{kbR$-gae0ei}3k-N2e#+ysK|#0B zad|c0P8XA&`R_S%tY$q3lMVCfY6HgxeuO^<3o6cU(Q zXR$s(kYrC?t&uJbI8~B$a(2+J^GD(l$?q5Cb9Hx|4Je1m9ZZm;BkH?RguI65u*rQ5>VuM`R(>r_V48n+eKJvo?Xv#UfSkWETsL{1Sg1? zAmp!rQ_zJ&QgAF055ZhnwufB9A{EGy8KBBmo!cEf?A!AnbbQJVee@|j57)0bn;|X~ zgN7^R8V3DAv~uau0s(`-etedd`|H=s4W7-FsuU-%=Xn$%?vzSitgh-4xD<1avXK3^ z5Dd%nFYzfZ#fG`^ea;jgZ&DO+0xyR>t!h5S&#C2RLyP<)&q2LA!QtK2+21PaQqY>QU);) zjTQ$Hb|Hz0xK!X_-IT};l*17%8^`ONyVJizW)ik0e}KPfCU394zGIQm>svkthlf>Hh{UaBm;y)$~)_3_up4@ z)r}7rfLYa|j&W4q=)2Z;b=UP>i}ufUD}5jP`rvbR_;ZZOS|Xo(^2u6kyv_JQ-fUrF-u7?vO`mp9Ri{_GFqbn^vMj)tP5Aa35zHUAV0#p2y zI>k>r^o;y+MT+Z3CEo%4z(UFEJAVYADU z?}8BB>%jq(>)KLK(@rcexKR4g(1$A+Yy-x3CA;Mxc2=dAjT5{2V1p8fio1YV64HY; zVguBiW-`@1$~*VeP4&&di@Ie7-O;&qWE9&ULx>=pL%`i7i6X=h@L=cGD?0sYmCw$s$AXbtEix&?7J4MWm(*&7fj;;xudti zOARbkmVXVgU|+?+Iy8VPN3c?)=fQ$`G$hG)JDTL5b@ETF4D5SgjW`WaqHO8grZ+D# ze-Y%$O%3VO_Xcy`lbOcT_y(exGx4k~k+UUW8FaP2YWj}pW%DcMtZCiMft%`u%Gyk2 zbH2KMv7$`*eQfK&Uv6-y*a-tT*F58oM{ zd19fqe&#^FwrP=JE4EOWRp&Q!zIHhM%H*u!X3w1U#qC*3TgK8>R1HgZ(>YwYk8>x` zhU45Q<`iBj#<&Yu5ap7%zS}QRgD;`MkM+IP@Z*MmYWzv#%)td$OUAS=|H$q|MpY4D z70gk?)Ye?F@kUOOPTYNS=+^&?IPF%yN&3z=+*f3 z_}t-K)rOgUSSw9O|3W4d$%Q5|jThjB-dtjbuv1BPAMMnB+Nu56smA7SN52()A^u`~ zW@MqIEn{lRcRY#%`Zxs`!&%eKaK3v-#?m_H&04l*EL%Ucx8>{G@uk!JwJ^-?Y;{kr zx+hp4qU!MtiTG-Cr|r?z+M2z6=t{m)r)qxv^pX*N=Re6Uq+| zt9HLic)E?zMU{+<7u57nPQD5Zx4q;{L4zQTdT&6H5)QwQ6b+F#*)!Ld** z@CErl2mc6Xtc9UAb~{zEM%*qvt@z%W~qt8%+(sT)8q+t_Vx+ z+2(hC!NJ10{fM{w*snnpu7W0{MDbv(E2&sg2Q~f{WmTccV1b7l-?)f7o~0E)xedGo zoMbz-=8s+*D_=W!D@~92BG$+5kO_qK2pbR%0l; z1+CvNB!X~*Itj%EqyJd{fOp70+<$N=t=t@b9ku*5!t)4ty+FQ=a2Mgn2>~7so>)+#y~Zhs!?sZw}R-AZ9GkMiZl>z@k?-1wHH|yub(%LT5x1D!{8& zx*4J!UNsUAb$zjfFcA&yBEJVYoc-eK^8Xxaur_CyvUfpt9j=zU4(sLa!~cy~+gui0v^G_-^$h}XXaY;}F zg=j*C$b?{9)WnQUwq+z)%GhOl#vwa6-=;ZbCyd)Pm+a!yExS4O$R19;vKOeN`7*tJ z*`KM9D>4B&kg1d_GgWd`rdqDvP!yDdihW6tL!zJtpWngm9IjoIYeeBOL2)h#itDl{ z2v^`&{N$Z{#0?{!Z6mdO#0w+7Z6kGx?YI#2{{vM-iRDdXv$~lu(|Xp3N>*jUn9nBT zIV~|iOVg0SaBaa?`|Sdw&H%3G!*uDc95|MjM8BQJ`p@cuq4COb96s6H%}gc z{zC8pbM@RK>=o}S=-cPh#7eFOcSdpVa14ct)*jR!wR3Z54BD2IOQcj*p<8qn-5XzfBfn9_NwjMeV z;RD!B!msq%iY;>QO?Gx?<6^7Qrg)d^vO{TC0!tEPjt->?s0(N{P`A>lgn)XK9gqmT zN)4b-i7GpR=LcE~v_k1p>VXE-N~K!~FG-hSjVq-Uu5<7p zevbVi+(c6BlknAUelF`R9Yiw*`URYZuwFE5zRHt!5v!wzDgZzCu;w zqKz1Ac~|5hcQtfCiV2A304%3ErzZ2JO0(E=B=t-tZAKlG+r&f4K~*EKne;GQ+Z82s ztiJhC$IBpWbg<*i`JD={q8d zN7=Fdsd2t4E=a$JfU6ZRp}1s1(;$X$Nl3tYK5NnoD1j>*5&(uU1n@um4-WqZd0`?5 ze}{kPgb#x|UmIEzBzN;e06RDoUVQPUmbBX%PQ*0AMBPy{#l`S@P(eWCET@n)%$`xI72%RN4q zRCD+=ftro3w7KjKv%ZP2_^D8soVez0;gh%z;wI_a+l0RkeYUGR6xf=;Bo=$}Ul2z9 zPqmT(u41b%^P}7xklcy42IP5m;prAKz^*<0IA|g#;^JOz$L4^Zf#ogFbebAw+)QUw zYNCBx;yS~Abh4A|XYZZ7*2ME=IT!QFB*To2GFf<84iU)MZ=Kq6?3Y|~B5&#&sn1m| zmD_9RuTj>f&Lu4gv$^4dK87KF#bU0r*3(^Nk{vsJd66gd6t0Ta(=`lF4tS0d=nC|o`m;|27JpKe=;^YIUYY6J2GA@%M*`J=S{4r zafg~~7QzT#6nkxNqvRof992u_3ngm)1N2sHo@e)BmM19mapiespcdwR7$z)@@k8Z8E)&g&3@_{&ge>BQxDMJf(Ybp!8pOL jo(Z#$;kS0i!TxjR&PZfLBB3?mf8$-wzyAfJ{E_uvseLOZ diff --git a/news/__pycache__/server.cpython-311.pyc b/news/__pycache__/server.cpython-311.pyc index 935bcded819dc103984aaffd55013c2b4e7e9efc..1e2f024434389d64464da1e523569815fc75ced1 100644 GIT binary patch literal 5973 zcmd5=-ESMm5#RgZ_lXoGQWo_|+lpl-mT60NoLI7z#8xE3g45cG+X@L7N_Up1@GE<# zBzCBliH$@ETvP!ZL_l4X4@GJtdGMnj^C$3x14JC)LO>Cqeq-PSDE!o!`=IzCp=t8c zy}RMw?9A@$?#yrJ`1iIpFM;y!=iXU*O(5i-_+l5{syz7zOURc*BqEb0bLKBI$G|(A zW;5Izm*MC5jAPD`an3n2t~nQD%CK`R$Z%3Ujp zLwo9an{<6Gy5T)_g(ls$7TvBrb^T4cffn6;d+N4}!A0hqj;~<9zlO&wVyD!7jTJ+; z_@;G>?AcFv*M7QMEa};^hkd*D(B0BQ@18wGcI~03rH4L|xk6&Sk8!e>Vh-K$>y;!` zbQhK0U6Ev{EX%oU>e_YPk(|##L#Hfd#l$t5%Ou1Ul@^p7y&hw9{#Qw534VhwQzf;K zmM%;AoSec&-6>1-J&8VL;tST28b26bH{p3Q3gjYTAjFhf5D*A9k!{?Fn-EqXa#zWh z+$TJk%w2+>l_ZRI!<)*A(g#PEluY_QLtW6{eewh-d3r3ql*>r*c`7MN{CsX1X&QY|45m(a(QF!yM)$(g zvf3Z2!Er4(UiP)z{J1#@hcSr;`+vbY(-2qzD~$#<+sos0XargY(bNM#>^1KBy!TFT z+1J;6^L@5UB=f2Gl9WsUD@RK1- zB9qkomByw=nhc%79}NN@u?HWp=eQzrBV3N zT~>>X?iGz1(MN}&ZV;@~Fq{(-oJ$_=iXCwzY-w z;Aq)Brn<*8_ZU09vRvbr z>bQ0*B5>mh=Mj-%gq>7HY}%n!^0UX{8+(Sano^uqMln(~)4mdJPfIBf@Cw+Tox{7q z%h*M-$`#o2RmZAx)m7k1b$+$eX~9u&7F?q9E`(#Fbqu+EK*3P6N`Y%^ z2VZnu(j5!ndWz1Do_xFtf;yj0%E}FPOsltRNOcbXc`TT!;|8`+3Ft`~QEIHM{uof6D2jcfm zsJ>&G?^yBtqww%ncziQF{&4Es6KZ%y3(u4T&3AETI}j|+RAR@ggm(xAQQgn1F>7P% zj`iPH-0gn~d=}W~tb}_i14pV3ZqQpLCW-jIC!9z40SWLd@}rjso$D{CzCH~w+y?ES z5pf9m_EI-Q9eSe?b*I7Otf&K)U6yMoSVF*PDb3Nf3J56&>k&?5MeZBE(VqbWagVH0 zszXehM}7*@*7>{W5S^`~6qw~&gh9O~5B1vkQ1{rQrikde>sV!hSBp}Jvyb0|xvz4o ze1Ttt*j)D6^1H7|a~z58yAZFU2hUuy6u1efKu_?oW^;C0VZ@m?0oAOyZqnS56TLh2 z5~eCG1!G^qQ`IWVc<>rU^c7r91Ar43Zr@b)+qz(R6M_O#1TN7R zLE!N^ILy=iZ$oUp9fj1JUXSLJGzo{XM58G=iupL?+aetTVR`_`K_D@Q8ObhrC6h^F zwyosysRdd0n!kxmGOv5Fr$w5~FUc`Z=Mif3Hlsa3A%V^U#P}_Ji$Pz(0nP)_x!+A) zdHq$;V^bpF7WS??O|@9iKq-nDJRL(~oSFtw`4KpF#x<;@lPL2Nl9!PfgG@n957s1Y zi|9$@vsA+Gha_ECvf&LEOBd|pTTCiRiaEM2>~sN!`gh)p>AKD7rbIde!t?@=-O#Q+ z+9@9AXHbyB)>vrg4cn8Q$eL?i(A@nSY1RE)RUn5?Xv0(ArnTWKRpRhKZ$MPx;M&Z(v~jfp zKsl=k`%6bOA^sqs2`_C4)0@I{rE8!P8F(~w=x-P9Ne`~7LlfH2M7einD4<{|SRdM}hpA z{G}<{0ljOuMaUjm$A8b0YiJBPnp1SOqIW}X(v-UZ; zAZi6ybpCqujmxvAqjhlO*)8IA=j?0K(^HpqK9#+eGcpx3Gq&<)3|S*c>vLzlqb{{b z%wNZk5PB6P--1W}E0CWZa=iD`iyvPsw;#WE4hY;TKdSMgWq$NgEiWHaL#MRRsj|QM z{@?QQSijC=ZbfIoVBPU9{&_;-2qNY>;R2YTh?}On@T)7Gnx}Y(>aKKdaS_&A=kvK- zn&M%qyAz2lfPMn9G|1}Aq|XBjH4CB=rLUumgXUJ?RKUL|j8A^^YZZ?)<3mvAWQCgR zkNKJbW5&WZeb7F2Ph@i9N?JNglOP8XMC5Teud5uxFcsn}8drsQipI4~rpiCqohMlnd23HMzQR`%td0e%!vQ zEm!s(d9dZIKYIBiXBvh+r28a0n-%#kW9pg}-T zdZfnbv!u(ufqUV5E)+keF~_QGAC3S7#gErGXKUhprInIl*HMi*T4g;b4g|&H4IG1Q H{L}Cs^;G=6 delta 872 zcmah{&1=*^6rV{Z$!2$x?RLAZ+Yel9K^DY{m1?2%CIu~BKTf8Hn3`#JOR`xf*+YA2 zk%Cv39Q4+n)=EA2@fY|92rX2QISPW_)~hGqY`ZE&aFY4GdB42hdv6~3bop)Gd}$bI z1k0H5YPU$+Jl9yp2t7wGa2xpE;3gLSMIhP7ITf!$1NJcVLb{P z^|cMIIIAIxE0~izPF!0~s`7-isU?BBpuR%PKq%qzoT1jJQZ%a@#Z~F>W#IKl|*0cCh7Fb31qc$#XcyocJlp7NI_VT%sN#N=p`6tk0-MF(Qch>425 zS^Z4S2cg^Y=>oe1+*R?jMP{=mzz-Qo!@n67 vNy^jGlhR>kYQMGbZI_Pl>@lA0N(7JZ&2$l%KOm&n`(L}+q%?Aah<<+o+HlT% diff --git a/news/pool.py b/news/pool.py index 59ce96e..7c77290 100644 --- a/news/pool.py +++ b/news/pool.py @@ -1,5 +1,7 @@ from __future__ import annotations +from dataclasses import dataclass + from playwright.async_api import async_playwright, Browser, BrowserContext, Page import asyncio @@ -11,7 +13,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple import logging def process_html(html): - return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, + return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True, include_tables=True, include_comments=False, favor_recall=True) LOGGER = logging.getLogger("pool") @@ -86,6 +88,16 @@ class DBConnectionInfo: self.user = user self.password = password +@dataclass(frozen=True) +class ArticleParagraphs: + article_id: int + paragraphs: list[tuple[int, str]] + topics: list[str] + topics_map: dict[int, str] + paragraph_ratings: dict[int, list[tuple[int, str, bool]]] + summary: str + summary_rating: dict[int, float] + title: str = "" class ArticleRepository: """ @@ -192,13 +204,72 @@ class ArticleRepository: return False return True - async def get_latest_articles(self, count): + async def get_latest_articles(self, count, last = -1) -> list[tuple[int, str, str, str]] | None: async with self._lock: cur = self._conn.cursor() - row = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,)) + if last > 0: + row = cur.execute(f"SELECT id, url, title, processed_html FROM articles WHERE id < {self.cursor_type} ORDER BY id DESC LIMIT {self.cursor_type}", (last, count)) + else: + row = cur.execute( + f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", + (count,)) return row.fetchall() + async def get_paragraphs(self, article_url : str) -> ArticleParagraphs | None: + async with self._lock: + cur = self._conn.cursor() + row = cur.execute(f"SELECT id, title FROM articles WHERE url = {self.cursor_type}", (article_url,)) + article_id, title = row.fetchone() + + if article_id is None: + return None + + row = cur.execute(f"SELECT id, paragraph_text FROM paragraphs WHERE article_id = {self.cursor_type}", (article_id,)) + + paragraphs: list[tuple[int, str]] = row.fetchall() + + row = cur.execute(f"SELECT id, topic_text FROM topics WHERE article_id = {self.cursor_type}", (article_id,)) + + topics: list[tuple[int, str]] = row.fetchall() + + topics_map = {} + for topic in topics: + topics_map[topic[0]] = topic[1] + + row = cur.execute(f"SELECT paragraph_id, topic_id, rating FROM topic_ratings WHERE topic_id IN (SELECT id FROM topics WHERE article_id = {self.cursor_type})", (article_id, )) + + topic_ratings: list[tuple[int, int, bool]] = row.fetchall() + + topic_ratings_map = {} + for paragraph_id, topic_id, rating in topic_ratings: + if not paragraph_id in topic_ratings_map: + topic_ratings_map[paragraph_id] = [] + topic_ratings_map[paragraph_id].append((topic_id, topics_map[topic_id], rating)) + + row = cur.execute(f"SELECT summary_text FROM summaries WHERE article_id = {self.cursor_type}", (article_id,)) + + summary = row.fetchone()[0] + + row = cur.execute(f"SELECT paragraph_id, rating FROM summary_ratings WHERE article_id = {self.cursor_type}", (article_id,)) + + summary_ratings = row.fetchall() + + summary_ratings_map = {} + for paragraph_id, rating in summary_ratings: + summary_ratings_map[paragraph_id] = rating + + return ArticleParagraphs( + article_id=article_id, + paragraphs=paragraphs, + topics=[topic[1] for topic in topics], + topics_map=topics_map, + paragraph_ratings=topic_ratings_map, + summary=summary, + summary_rating=summary_ratings_map, + title=title + ) + async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings): async with self._lock: article_id = self._row_for_url(url)[0] diff --git a/news/server.py b/news/server.py index 628b381..9148a45 100644 --- a/news/server.py +++ b/news/server.py @@ -1,4 +1,8 @@ +import json +import re + from quart import Quart, request, jsonify, abort, send_from_directory +import quart from pathlib import Path import logging @@ -16,17 +20,76 @@ LOGGER = logging.getLogger("server") async def index(): return await send_from_directory("static", "index.html") -@app.route("/health") +@app.route("/index.html") +async def index_html(): + return await index() + +@app.route("/view.html") +async def view_html(): + return await send_from_directory("static", "view.html") + +@app.route("/view") +async def view(): + return await view_html() + +@app.route("/api/health") async def health(): return {"status": "ok"} -@app.route("/articles/", methods=["GET"]) +@app.route("/api/article/", methods=["GET"]) async def get_article(article_url: str): article = await article_repository.get_article(article_url) if article is None: abort(404, description="Article not found") return jsonify(article) +@app.route("/api/articles", methods=["GET"]) +async def get_articles(): + count = min(int(request.args.get("count") or "25"), 125) + last = int(request.args.get("last") or "-1") + articles = await article_repository.get_latest_articles(count, last) + + json_obj = [] + for _, url, title, processed_html in articles: + json_obj.append({url: { + "title": title, + "processed_text": processed_html, + }}) + + return jsonify(json_obj) + +@app.route("/api/view_article", methods=["GET"]) +async def view_article(): + url = request.args.get("url") + if not url: + abort(400, description="`url` query parameter is required") + + article_data = await article_repository.get_paragraphs(url) + if article_data is None: + abort(404, description="Article not found") + article = { + "title": article_data.title, + "summary": article_data.summary, + "topics": article_data.topics, + "topics_map": article_data.topics_map, + "paragraphs": {} + } + for paragraph_id, paragraph_text in article_data.paragraphs: + article["paragraphs"][paragraph_id] = { + "text": paragraph_text, + "topic_ratings": [], + "summary_rating": article_data.summary_rating.get(paragraph_id) + } + + for topic_id, topic, rating in article_data.paragraph_ratings[paragraph_id]: + article["paragraphs"][paragraph_id]["topic_ratings"].append({ + "id": topic_id, + "topic": topic, + "rating": (True if (re.search("YES", rating)) else False) + }) + return jsonify(article) + + @app.route("/article-by-url", methods=["GET"]) async def get_article_by_url(): url = request.args.get("url") diff --git a/news/static/index.html b/news/static/index.html index e69de29..b57988e 100644 --- a/news/static/index.html +++ b/news/static/index.html @@ -0,0 +1,128 @@ + + + + + Article Summaries + + + + + + +
+

Newsulizer

+
+
+

Latest Article Summaries

+
+ +
+ + + + + \ No newline at end of file diff --git a/news/static/view.html b/news/static/view.html new file mode 100644 index 0000000..217ba5c --- /dev/null +++ b/news/static/view.html @@ -0,0 +1,228 @@ + + + + + Article View + + + + + + + +
+

+
+ + +
+
+
+

Article Paragraphs

+
+

Extracted from the original article

+
+
+ +
+ +
+

Article Topics (AI Generated)

+
+
    + +
    +

    Article Summary (AI Generated)

    +
    + + + + + + + \ No newline at end of file