comes together

main
Brett 2025-07-06 22:12:15 -04:00
parent d3927f5bca
commit 53e22c8465
6 changed files with 495 additions and 5 deletions

View File

@ -1,5 +1,7 @@
from __future__ import annotations
from dataclasses import dataclass
from playwright.async_api import async_playwright, Browser, BrowserContext, Page
import asyncio
@ -11,7 +13,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple
import logging
def process_html(html):
    """Extract the main content of an HTML document as plain text.

    Delegates to ``trafilatura.extract`` with ``output_format='txt'``,
    keeping images, inline formatting and tables, dropping comments and
    enabling ``favor_recall``.

    :param html: the HTML document handed to ``trafilatura.extract``.
    :return: whatever ``trafilatura.extract`` returns for this input.
    """
    # Only the 'txt' call is kept: the earlier 'markdown' call was left
    # unterminated in front of it, which made the function unparseable.
    return trafilatura.extract(
        html,
        output_format='txt',
        include_images=True,
        include_formatting=True,
        include_tables=True,
        include_comments=False,
        favor_recall=True,
    )
LOGGER = logging.getLogger("pool")
@ -86,6 +88,16 @@ class DBConnectionInfo:
self.user = user
self.password = password
@dataclass(frozen=True)
class ArticleParagraphs:
    """Immutable bundle of everything stored about one analysed article.

    Instances are frozen: attributes cannot be reassigned after
    construction (the containers they reference remain ordinary mutable
    objects).
    """

    # Primary key of the article row.
    article_id: int
    # (paragraph id, paragraph text) pairs belonging to the article.
    paragraphs: list[tuple[int, str]]
    # Topic texts of the article, without their ids.
    topics: list[str]
    # Topic id -> topic text.
    topics_map: dict[int, str]
    # Paragraph id -> list of (topic id, topic text, rating).
    # NOTE(review): the HTTP layer runs a "YES" regex over the rating, so the
    # stored value may actually be text rather than bool - confirm.
    paragraph_ratings: dict[int, list[tuple[int, str, bool]]]
    # Summary text of the article.
    summary: str
    # Paragraph id -> rating of that paragraph against the summary.
    summary_rating: dict[int, float]
    # Article title; defaults to an empty string.
    title: str = ""
class ArticleRepository:
"""
@ -192,13 +204,72 @@ class ArticleRepository:
return False
return True
async def get_latest_articles(self, count, last=-1) -> list[tuple[int, str, str, str]] | None:
    """Return up to *count* articles ordered by descending id.

    :param count: maximum number of rows, bound to the ``LIMIT`` clause.
    :param last: when greater than zero, only rows whose id is strictly
        smaller than this value are returned, so a caller can pass the
        smallest id it has already seen to fetch the next page. Any
        other value disables the filter.
    :return: rows of ``(id, url, title, processed_html)``.
    """
    # self.cursor_type is interpolated wherever a bound parameter goes;
    # presumably it holds the driver's placeholder token - confirm.
    placeholder = self.cursor_type
    columns = "SELECT id, url, title, processed_html FROM articles"
    if last > 0:
        query = f"{columns} WHERE id < {placeholder} ORDER BY id DESC LIMIT {placeholder}"
        params = (last, count)
    else:
        query = f"{columns} ORDER BY id DESC LIMIT {placeholder}"
        params = (count,)

    async with self._lock:
        cur = self._conn.cursor()
        row = cur.execute(query, params)
        return row.fetchall()
async def get_paragraphs(self, article_url: str) -> ArticleParagraphs | None:
    """Load paragraphs, topics, ratings and summary for one article.

    :param article_url: URL used to look the article up in ``articles``.
    :return: an ``ArticleParagraphs`` bundle, or ``None`` when no article
        with that URL exists. When the article has no summary row the
        bundle carries an empty summary string.
    """
    placeholder = self.cursor_type
    async with self._lock:
        cur = self._conn.cursor()

        # fetchone() yields None for an unknown URL; test the row itself
        # before unpacking it, otherwise the unpack raises TypeError.
        article_row = cur.execute(
            f"SELECT id, title FROM articles WHERE url = {placeholder}", (article_url,)
        ).fetchone()
        if article_row is None:
            return None
        article_id, title = article_row
        if article_id is None:
            return None

        paragraphs: list[tuple[int, str]] = cur.execute(
            f"SELECT id, paragraph_text FROM paragraphs WHERE article_id = {placeholder}",
            (article_id,),
        ).fetchall()

        topics: list[tuple[int, str]] = cur.execute(
            f"SELECT id, topic_text FROM topics WHERE article_id = {placeholder}",
            (article_id,),
        ).fetchall()
        topics_map = {topic[0]: topic[1] for topic in topics}

        topic_ratings: list[tuple[int, int, bool]] = cur.execute(
            f"SELECT paragraph_id, topic_id, rating FROM topic_ratings WHERE topic_id IN (SELECT id FROM topics WHERE article_id = {placeholder})",
            (article_id,),
        ).fetchall()
        # Group the ratings per paragraph, resolving each topic id to its text.
        topic_ratings_map = {}
        for paragraph_id, topic_id, rating in topic_ratings:
            topic_ratings_map.setdefault(paragraph_id, []).append(
                (topic_id, topics_map[topic_id], rating)
            )

        # An article may not have a summary yet; fall back to an empty
        # string instead of subscripting None.
        summary_row = cur.execute(
            f"SELECT summary_text FROM summaries WHERE article_id = {placeholder}",
            (article_id,),
        ).fetchone()
        summary = summary_row[0] if summary_row is not None else ""

        summary_ratings = cur.execute(
            f"SELECT paragraph_id, rating FROM summary_ratings WHERE article_id = {placeholder}",
            (article_id,),
        ).fetchall()
        summary_ratings_map = {
            paragraph_id: rating for paragraph_id, rating in summary_ratings
        }

        return ArticleParagraphs(
            article_id=article_id,
            paragraphs=paragraphs,
            topics=[topic[1] for topic in topics],
            topics_map=topics_map,
            paragraph_ratings=topic_ratings_map,
            summary=summary,
            summary_rating=summary_ratings_map,
            title=title,
        )
async def set_paragraphs(self, url, paragraphs, summary, summary_ratings, topics, topic_ratings):
async with self._lock:
article_id = self._row_for_url(url)[0]

View File

@ -1,4 +1,8 @@
import json
import re
from quart import Quart, request, jsonify, abort, send_from_directory
import quart
from pathlib import Path
import logging
@ -16,17 +20,76 @@ LOGGER = logging.getLogger("server")
async def index():
    """Serve ``index.html`` from the ``static`` directory."""
    landing_page = await send_from_directory("static", "index.html")
    return landing_page
# Only the "/index.html" route belongs on this handler: the health check is
# registered on its own handler, so a "/health" decorator here would serve the
# landing page on that path.
@app.route("/index.html")
async def index_html():
    """Serve the landing page under its explicit ``/index.html`` path.

    Delegates to ``index()`` so both paths return the same response.
    """
    return await index()
@app.route("/view.html")
async def view_html():
    """Serve ``view.html`` from the ``static`` directory."""
    article_page = await send_from_directory("static", "view.html")
    return article_page
@app.route("/view")
async def view():
    """Extension-less alias of ``/view.html``; delegates to ``view_html()``."""
    response = await view_html()
    return response
@app.route("/api/health")
async def health():
    """Report service health with a constant ``{"status": "ok"}`` payload."""
    status_payload = {"status": "ok"}
    return status_payload
# The <path:...> converter is used so the URL segment may contain "/"; the
# earlier "/articles/<article_url>" registration is superseded by this route.
@app.route("/api/article/<path:article_url>", methods=["GET"])
async def get_article(article_url: str):
    """Return the stored article for *article_url* as JSON.

    :param article_url: article URL taken from the request path.
    :return: the JSON-encoded result of ``article_repository.get_article``.
    Responds with HTTP 404 when the repository returns ``None``.
    """
    article = await article_repository.get_article(article_url)
    if article is None:
        abort(404, description="Article not found")
    return jsonify(article)
@app.route("/api/articles", methods=["GET"])
async def get_articles():
    """Return the latest articles as a JSON list.

    Query parameters:
        count: maximum number of articles (default 25, clamped to 0..125).
        last:  when positive, only articles with a smaller id are returned
               (default -1, i.e. start from the newest).

    Each list element is ``{url: {"title": ..., "processed_text": ...}}``.
    Responds with HTTP 400 when either parameter is not an integer.
    """
    try:
        count = int(request.args.get("count") or "25")
        last = int(request.args.get("last") or "-1")
    except ValueError:
        # Malformed client input is a client error, not a server crash.
        abort(400, description="`count` and `last` must be integers")

    # Clamp on both sides: the value is bound to a SQL LIMIT, where a negative
    # number is either rejected or (in SQLite) means "no limit", which would
    # bypass the upper bound.
    count = max(0, min(count, 125))

    # The repository is annotated as possibly returning None; treat that as
    # "no articles".
    articles = await article_repository.get_latest_articles(count, last) or []
    json_obj = [
        {url: {"title": title, "processed_text": processed_html}}
        for _, url, title, processed_html in articles
    ]
    return jsonify(json_obj)
@app.route("/api/view_article", methods=["GET"])
async def view_article():
    """Return one article's title, summary, topics and rated paragraphs.

    Query parameters:
        url: URL of the article to load (required).

    The response maps each paragraph id to its text, its per-topic ratings
    (as booleans) and its summary rating (``null`` when absent).
    Responds with HTTP 400 when ``url`` is missing and 404 when the
    repository has no such article.
    """
    url = request.args.get("url")
    if not url:
        abort(400, description="`url` query parameter is required")
    article_data = await article_repository.get_paragraphs(url)
    if article_data is None:
        abort(404, description="Article not found")

    paragraphs = {}
    for paragraph_id, paragraph_text in article_data.paragraphs:
        topic_ratings = []
        # A paragraph without any topic rating has no entry in the map;
        # .get() keeps that from raising KeyError.
        for topic_id, topic, rating in article_data.paragraph_ratings.get(paragraph_id, []):
            # Text ratings count as positive when they contain "YES"; any
            # other value (e.g. a real bool) is taken by its truthiness
            # rather than being fed to the regex.
            if isinstance(rating, str):
                is_positive = bool(re.search("YES", rating))
            else:
                is_positive = bool(rating)
            topic_ratings.append({
                "id": topic_id,
                "topic": topic,
                "rating": is_positive,
            })
        paragraphs[paragraph_id] = {
            "text": paragraph_text,
            "topic_ratings": topic_ratings,
            "summary_rating": article_data.summary_rating.get(paragraph_id),
        }

    article = {
        "title": article_data.title,
        "summary": article_data.summary,
        "topics": article_data.topics,
        "topics_map": article_data.topics_map,
        "paragraphs": paragraphs,
    }
    return jsonify(article)
@app.route("/article-by-url", methods=["GET"])
async def get_article_by_url():
url = request.args.get("url")

View File

@ -0,0 +1,128 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Article Summaries</title>
<!-- Tiny bit of styling so it already looks decent -->
<style>
body{
font-family: Arial, sans-serif;
margin: 0;
padding: 0 1rem;
background:#f6f8fa;
}
h1{color:#333;text-align:center;margin-top:1.5rem;}
#articles{
display:flex;
flex-direction:column;
gap:1rem;
max-width:800px;
margin:2rem auto;
}
.article-card{
background:#fff;
border-radius:6px;
padding:1rem 1.5rem;
box-shadow:0 1px 3px rgba(0,0,0,.08);
}
.article-card h2{
margin:.2rem 0 .6rem;
font-size:1.2rem;
}
.article-card p{
color:#444;
margin:0;
}
.article-link .article-card{
transition: transform .15s ease, box-shadow .15s ease;
}
.article-link:hover .article-card,
.article-link:focus-visible .article-card{
transform: translateY(-4px) scale(1.02);
box-shadow: 0 6px 14px rgba(0,0,0,.16);
}
.article-link,
.article-link:visited,
.article-link:hover,
.article-link:active,
.article-link:focus {
text-decoration: none;
color: inherit; /* keep the original text color */
}
.error{
color:#c00;
text-align:center;
margin-top:2rem;
}
</style>
</head>
<body>
<div style="display: flex; justify-content: center;">
<h1>Newsulizer</h1>
</div>
<div style="display: flex; justify-content: center;">
<h2>Latest Article Summaries</h2>
</div>
<section id="articles" aria-live="polite"></section>
<p id="error" class="error" hidden>Unable to load articles. Please try again later.</p>
<script type="text/javascript">
/* Fetch the latest articles and render one linked card per article into
   #articles; reveal #error if the request or the rendering fails. */
(function loadArticles() {
  const container = document.getElementById('articles');
  const errorEl = document.getElementById('error');

  // Change this to the full path of your API if it differs
  const ENDPOINT = '/api/articles?count=25';

  fetch(ENDPOINT)
    .then(res => {
      if (!res.ok) throw new Error(`HTTP ${res.status}`);
      return res.json();
    })
    .then(data => {
      if (!Array.isArray(data) || data.length === 0) {
        throw new Error('No data returned');
      }
      data.forEach(renderCard);
    })
    .catch(err => {
      console.error(err);
      errorEl.hidden = false;
    });

  // Render one `{url: {title, processed_text}}` item as a card.
  function renderCard(item) {
    // Skip a malformed entry instead of aborting the whole list.
    const entry = Object.entries(item || {})[0];
    if (!entry) return;
    const url = entry[0];
    const meta = entry[1] || {};

    /* Build <a><article>… inside …</article></a> so
       the whole card is a single coherent link */
    const link = document.createElement('a');
    link.className = 'article-link';
    link.href = '/view?url=' + encodeURIComponent(url);

    const card = document.createElement('article');
    card.className = 'article-card';

    const h2 = document.createElement('h2');
    h2.textContent = meta.title || url;

    const p = document.createElement('p');
    p.textContent = truncate(meta.processed_text, 280);

    card.appendChild(h2);
    card.appendChild(p);
    link.appendChild(card);
    container.appendChild(link);
  }

  // Shorten long text to at most maxLen characters (plus an ellipsis)
  // without cutting a word in half. Non-string input yields ''.
  function truncate(str, maxLen) {
    if (typeof str !== 'string') return '';
    if (str.length <= maxLen) return str;

    let clipped = str.slice(0, maxLen);
    // If the cut landed inside a word, back up to the whitespace before it.
    // A single over-long word has no such boundary and is cut hard.
    if (!/\s/.test(str.charAt(maxLen))) {
      const boundary = clipped.search(/\s+\S*$/);
      if (boundary > 0) clipped = clipped.slice(0, boundary);
    }
    return clipped.trimEnd() + '…';
  }
})();
</script>
</body>
</html>

228
news/static/view.html Normal file
View File

@ -0,0 +1,228 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>Article View</title>
<style>
:root{
/* future per-paragraph background color */
--card-bg: #ffffff;
}
/* ─────────── Global title bar ─────────── */
.navbar{
position:fixed;
top:0; left:0; right:0;
height:3rem;
background:#0d47a1;
color:#fff;
display:flex;
align-items:center;
justify-content:space-between;
padding:0 1rem;
box-shadow:0 1px 4px rgba(0,0,0,.15);
z-index:100;
}
.navbar .brand{
font-size:1.15rem;
font-weight:700;
color:#fff;
text-decoration:none;
}
.navbar .nav-link{
color:#fff;
text-decoration:none;
margin-left:1rem;
font-size:.95rem;
}
.navbar .nav-link:hover{
text-decoration:underline;
}
body{
font-family: Arial, sans-serif;
margin:0;
padding: 4rem 1rem 2rem;
background:#f6f8fa;
color:#222;
}
h1{
text-align:center;
margin:1.5rem 0 1rem;
font-size:1.6rem;
}
a,
a:visited,
a:hover,
a:active,
a:focus {
/*text-decoration: none;*/
color: inherit; /* keep the original text color */
}
/* TOPICS GRID ------------------------------------------------------- */
#topics{
display:grid;
grid-template-columns:repeat(auto-fill,minmax(220px,1fr));
gap:.75rem;
max-width:1200px;
margin:0 auto 2rem;
list-style:none;
padding:0;
}
.topic-chip{
background:#e8eef6;
border-radius:4px;
padding:.6rem .8rem;
font-size:.9rem;
line-height:1.3;
}
/* SUMMARY BLOB ------------------------------------------------------ */
#summary{
background:#fff8e6;
border-left:5px solid #f5c147;
border-radius:4px;
padding:1rem 1.2rem;
max-width:900px;
margin:0 auto 2rem;
font-size:1rem;
}
/* PARAGRAPH CARDS --------------------------------------------------- */
#paragraphs{
display:flex;
flex-direction:column;
gap:1rem;
max-width:900px;
margin:0 auto;
}
.paragraph-card{
background:var(--card-bg);
border-radius:6px;
padding:1rem 1.2rem;
box-shadow:0 1px 3px rgba(0,0,0,.08);
transition:box-shadow .15s ease;
}
.paragraph-card:hover{
box-shadow:0 3px 8px rgba(0,0,0,.14);
}
.error{
color:#c00;
text-align:center;
margin-top:2rem;
}
</style>
</head>
<body>
<header class="navbar">
<a href="/" class="brand">Newsulizer</a>
<nav>
<a href="/" class="nav-link">Home</a>
</nav>
</header>
<div style="display: flex; justify-content: center;">
<h1 id="title"><b></b></h1>
</div>
<div style="display: flex; justify-content: center;">
<div style="display: block; padding-bottom: 1rem;">
<div style="display: flex; justify-content: center;">
<h2>Article Paragraphs</h2>
</div>
<p>Extracted from the original article</p>
</div>
</div>
<section id="paragraphs" aria-label="Article paragraphs"></section>
<div style="display: flex; justify-content: center;">
<h2>Article Topics (AI Generated)</h2>
</div>
<ul id="topics" aria-label="Article topics"></ul>
<div style="display: flex; justify-content: center;">
<h2>Article Summary (AI Generated)</h2>
</div>
<section id="summary" hidden></section>
<p id="error" class="error" hidden>Unable to load article. Please try again later.</p>
<script type="text/javascript">
/* Load the article named by the `url` query parameter from the API and
   render its title, topics, summary and paragraphs. */
(function main(){
  const qs = new URLSearchParams(window.location.search);
  const url = qs.get('url');
  const API = '/api/view_article?url=' + encodeURIComponent(url ?? '');

  const elTopics = document.getElementById('topics');
  const elSummary = document.getElementById('summary');
  const elParagraphs = document.getElementById('paragraphs');
  const elError = document.getElementById('error');
  const elTitle = document.getElementById('title');

  if(!url){
    elError.hidden = false;
    elError.textContent = '`url` query parameter missing.';
    return;
  }

  fetch(API)
    .then(r => {
      if(!r.ok) throw new Error(`HTTP ${r.status}`);
      return r.json();
    })
    .then(renderArticle)
    .catch(err => {
      console.error(err);
      elError.hidden = false;
    });

  /* Return an absolute http(s) URL for `raw`, or null when it cannot be
     parsed or uses another scheme (e.g. `javascript:`). */
  function toHttpUrl(raw){
    try {
      const parsed = new URL(raw, window.location.href);
      const isHttp = parsed.protocol === 'http:' || parsed.protocol === 'https:';
      return isHttp ? parsed.href : null;
    } catch (err) {
      return null;
    }
  }

  /* ---------------------------------------------------------------- */
  function renderArticle(data){
    /* 0. TITLE ---------------------------------------------------
       `url` comes from the query string and `data.title` from stored
       article data, so neither may be spliced into markup. Build the
       link with DOM APIs and only link to http(s) targets. */
    const titleText = '"' + data.title + '"';
    const safeHref = toHttpUrl(url);
    elTitle.textContent = '';
    if(safeHref){
      const a = document.createElement('a');
      a.href = safeHref;
      a.textContent = titleText;
      elTitle.appendChild(a);
    } else {
      elTitle.textContent = titleText;
    }

    /* 1. TOPICS -------------------------------------------------- */
    if(Array.isArray(data.topics)){
      data.topics.forEach(topic => {
        const li = document.createElement('li');
        li.className = 'topic-chip';
        li.textContent = topic;
        elTopics.appendChild(li);
      });
    }

    /* 2. SUMMARY ------------------------------------------------- */
    if(typeof data.summary === 'string' && data.summary.trim() !== ''){
      elSummary.textContent = data.summary.trim();
      elSummary.hidden = false;
    }

    /* 3. PARAGRAPHS --------------------------------------------- */
    Object.entries(data.paragraphs ?? {}).forEach(([pid, pData]) => {
      const card = document.createElement('div');
      card.className = 'paragraph-card';

      /* store ratings for future use */
      card.dataset.summaryRating = pData.summary_rating ?? '';
      card.dataset.topicRatings = JSON.stringify(
        (pData.topic_ratings ?? []).map(r => !!r.rating)
      );

      const p = document.createElement('p');
      p.textContent = pData.text;
      card.appendChild(p);

      elParagraphs.appendChild(card);
    });
  }
})();
</script>
</body>
</html>