diff --git a/news/__pycache__/pool.cpython-311.pyc b/news/__pycache__/pool.cpython-311.pyc index 803a77a..acbcadf 100644 Binary files a/news/__pycache__/pool.cpython-311.pyc and b/news/__pycache__/pool.cpython-311.pyc differ diff --git a/news/__pycache__/server.cpython-311.pyc b/news/__pycache__/server.cpython-311.pyc index 935bcde..1e2f024 100644 Binary files a/news/__pycache__/server.cpython-311.pyc and b/news/__pycache__/server.cpython-311.pyc differ diff --git a/news/pool.py b/news/pool.py index 59ce96e..7c77290 100644 --- a/news/pool.py +++ b/news/pool.py @@ -1,5 +1,7 @@ from __future__ import annotations +from dataclasses import dataclass + from playwright.async_api import async_playwright, Browser, BrowserContext, Page import asyncio @@ -11,7 +13,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple import logging def process_html(html): - return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True, + return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True, include_tables=True, include_comments=False, favor_recall=True) LOGGER = logging.getLogger("pool") @@ -86,6 +88,16 @@ class DBConnectionInfo: self.user = user self.password = password +@dataclass(frozen=True) +class ArticleParagraphs: + article_id: int + paragraphs: list[tuple[int, str]] + topics: list[str] + topics_map: dict[int, str] + paragraph_ratings: dict[int, list[tuple[int, str, bool]]] + summary: str + summary_rating: dict[int, float] + title: str = "" class ArticleRepository: """ @@ -192,13 +204,72 @@ class ArticleRepository: return False return True - async def get_latest_articles(self, count): + async def get_latest_articles(self, count, last = -1) -> list[tuple[int, str, str, str]] | None: async with self._lock: cur = self._conn.cursor() - row = cur.execute(f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", (count,)) + if last > 0: + row = cur.execute(f"SELECT id, url, title, processed_html FROM articles WHERE id < {self.cursor_type} ORDER BY id DESC LIMIT {self.cursor_type}", (last, count)) + else: + row = cur.execute( + f"SELECT id, url, title, processed_html FROM articles ORDER BY id DESC LIMIT {self.cursor_type}", + (count,)) return row.fetchall() + async def get_paragraphs(self, article_url : str) -> ArticleParagraphs | None: + async with self._lock: + cur = self._conn.cursor() + row = cur.execute(f"SELECT id, title FROM articles WHERE url = {self.cursor_type}", (article_url,)) + article_id, title = row.fetchone() + + if article_id is None: + return None + + row = cur.execute(f"SELECT id, paragraph_text FROM paragraphs WHERE article_id = {self.cursor_type}", (article_id,)) + + paragraphs: list[tuple[int, str]] = row.fetchall() + + row = cur.execute(f"SELECT id, topic_text FROM topics WHERE article_id = {self.cursor_type}", (article_id,)) + + topics: list[tuple[int, str]] = row.fetchall() + + topics_map = {} + for topic in topics: + topics_map[topic[0]] = topic[1] + + row = cur.execute(f"SELECT paragraph_id, topic_id, rating FROM topic_ratings WHERE topic_id IN (SELECT id FROM topics WHERE article_id = {self.cursor_type})", (article_id, )) + + topic_ratings: list[tuple[int, int, bool]] = row.fetchall() + + topic_ratings_map = {} + for paragraph_id, topic_id, rating in topic_ratings: + if not paragraph_id in topic_ratings_map: + topic_ratings_map[paragraph_id] = [] + topic_ratings_map[paragraph_id].append((topic_id, topics_map[topic_id], rating)) + + row = cur.execute(f"SELECT summary_text FROM summaries WHERE article_id = {self.cursor_type}", (article_id,)) + + summary = row.fetchone()[0] + + row = cur.execute(f"SELECT paragraph_id, rating FROM summary_ratings WHERE article_id = {self.cursor_type}", (article_id,)) + + summary_ratings = row.fetchall() + + summary_ratings_map = {} + for paragraph_id, rating in summary_ratings: + summary_ratings_map[paragraph_id] = rating + + return ArticleParagraphs( + article_id=article_id, + paragraphs=paragraphs, + topics=[topic[1] for topic in topics], + topics_map=topics_map, + paragraph_ratings=topic_ratings_map, + summary=summary, + summary_rating=summary_ratings_map, + title=title + ) + async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings): async with self._lock: article_id = self._row_for_url(url)[0] diff --git a/news/server.py b/news/server.py index 628b381..9148a45 100644 --- a/news/server.py +++ b/news/server.py @@ -1,4 +1,8 @@ +import json +import re + from quart import Quart, request, jsonify, abort, send_from_directory +import quart from pathlib import Path import logging @@ -16,17 +20,76 @@ LOGGER = logging.getLogger("server") async def index(): return await send_from_directory("static", "index.html") -@app.route("/health") +@app.route("/index.html") +async def index_html(): + return await index() + +@app.route("/view.html") +async def view_html(): + return await send_from_directory("static", "view.html") + +@app.route("/view") +async def view(): + return await view_html() + +@app.route("/api/health") async def health(): return {"status": "ok"} -@app.route("/articles/", methods=["GET"]) +@app.route("/api/article/", methods=["GET"]) async def get_article(article_url: str): article = await article_repository.get_article(article_url) if article is None: abort(404, description="Article not found") return jsonify(article) +@app.route("/api/articles", methods=["GET"]) +async def get_articles(): + count = min(int(request.args.get("count") or "25"), 125) + last = int(request.args.get("last") or "-1") + articles = await article_repository.get_latest_articles(count, last) + + json_obj = [] + for _, url, title, processed_html in articles: + json_obj.append({url: { + "title": title, + "processed_text": processed_html, + }}) + + return jsonify(json_obj) + +@app.route("/api/view_article", methods=["GET"]) +async def view_article(): + url = request.args.get("url") + if not url: + abort(400, description="`url` query parameter is required") + + article_data = await article_repository.get_paragraphs(url) + if article_data is None: + abort(404, description="Article not found") + article = { + "title": article_data.title, + "summary": article_data.summary, + "topics": article_data.topics, + "topics_map": article_data.topics_map, + "paragraphs": {} + } + for paragraph_id, paragraph_text in article_data.paragraphs: + article["paragraphs"][paragraph_id] = { + "text": paragraph_text, + "topic_ratings": [], + "summary_rating": article_data.summary_rating.get(paragraph_id) + } + + for topic_id, topic, rating in article_data.paragraph_ratings[paragraph_id]: + article["paragraphs"][paragraph_id]["topic_ratings"].append({ + "id": topic_id, + "topic": topic, + "rating": (True if (re.search("YES", rating)) else False) + }) + return jsonify(article) + + @app.route("/article-by-url", methods=["GET"]) async def get_article_by_url(): url = request.args.get("url") diff --git a/news/static/index.html b/news/static/index.html index e69de29..b57988e 100644 --- a/news/static/index.html +++ b/news/static/index.html @@ -0,0 +1,128 @@ + + + + + Article Summaries + + + + + + +
+

Newsulizer

+
+
+

Latest Article Summaries

+
+ +
+ + + + + \ No newline at end of file diff --git a/news/static/view.html b/news/static/view.html new file mode 100644 index 0000000..217ba5c --- /dev/null +++ b/news/static/view.html @@ -0,0 +1,228 @@ + + + + + Article View + + + + + + + +
+

+
+ + +
+
+
+

Article Paragraphs

+
+

Extracted from the original article

+
+
+ +
+ +
+

Article Topics (AI Generated)

+
+ + +
+

Article Summary (AI Generated)

+
+ + + + + + + \ No newline at end of file