super
parent
eb7ee3054d
commit
329ffc8c8c
Binary file not shown.
Binary file not shown.
33
news/main.py
33
news/main.py
|
@ -20,6 +20,10 @@ from ollama import Client
|
||||||
from ollama import AsyncClient
|
from ollama import AsyncClient
|
||||||
import time
|
import time
|
||||||
import json
|
import json
|
||||||
|
import server
|
||||||
|
import threading
|
||||||
|
from typing import NoReturn
|
||||||
|
|
||||||
|
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
@ -39,8 +43,6 @@ logging.basicConfig(
|
||||||
datefmt="%Y-%m-%d %H:%M:%S"
|
datefmt="%Y-%m-%d %H:%M:%S"
|
||||||
)
|
)
|
||||||
|
|
||||||
article_repository = ArticleRepository()
|
|
||||||
|
|
||||||
social_system_prompt = ("You are a specialized analysis program designed to determine if articles are pro-social. "
|
social_system_prompt = ("You are a specialized analysis program designed to determine if articles are pro-social. "
|
||||||
"Pro-social text contains topics such as raising concerns about the negative effects on workers, the environment, "
|
"Pro-social text contains topics such as raising concerns about the negative effects on workers, the environment, "
|
||||||
"or on society as a whole (as in the concerns of the 99%, or the proletariat). "
|
"or on society as a whole (as in the concerns of the 99%, or the proletariat). "
|
||||||
|
@ -194,9 +196,9 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
LOGGER.info("Received URL from %s: %s", message.author, url)
|
LOGGER.info("Received URL from %s: %s", message.author, url)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
title, processed_html = await article_repository.get_article(url)
|
title, processed_html = await server.article_repository.get_article(url)
|
||||||
|
|
||||||
if await article_repository.has_paragraphs(url):
|
if await server.article_repository.has_paragraphs(url):
|
||||||
await message.channel.send("This article has already been processed.")
|
await message.channel.send("This article has already been processed.")
|
||||||
LOGGER.info(f"Article {url} already processed")
|
LOGGER.info(f"Article {url} already processed")
|
||||||
return
|
return
|
||||||
|
@ -213,15 +215,19 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
|
|
||||||
summary_bot.set_system("You are a specialized analysis program designed to summarize articles into their key points.\n "
|
summary_bot.set_system("You are a specialized analysis program designed to summarize articles into their key points.\n "
|
||||||
"You WILL output as many key points as possible, but you MUST output at least 1 key point.\n"
|
"You WILL output as many key points as possible, but you MUST output at least 1 key point.\n"
|
||||||
"You WILL only output a JSON list of key points, structured as {key_points: [\"keypoint1\", \"keypoint2\",...]}. ")
|
"You WILL only output a JSON list of key points, structured as {key_points: [\"keypoint1\", \"keypoint2\",...]}. "
|
||||||
|
"DO NOT OUTPUT ANYTHING BUT THE PROPERLY FORMATTED JSON.")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
keywords = [item for sublist in (json.loads(sumr.content())["key_points"] for sumr in await summary_bot.multi_summary(processed_html, options={
|
silly = await summary_bot.multi_summary(processed_html, options={
|
||||||
"temperature": 0.5,
|
"temperature": 0.5,
|
||||||
"num_ctx": 4096
|
"num_ctx": 4096
|
||||||
})) for item in sublist]
|
}, format="json")
|
||||||
|
keywords = [item for sublist in (json.loads(sumr.content())["key_points"] for sumr in silly) for item in sublist]
|
||||||
LOGGER.info(keywords)
|
LOGGER.info(keywords)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
|
for sil in silly:
|
||||||
|
LOGGER.error(sil.content())
|
||||||
LOGGER.error("Failed to correctly parse LLM output. It is likely that it has failed.")
|
LOGGER.error("Failed to correctly parse LLM output. It is likely that it has failed.")
|
||||||
LOGGER.error(exc, exc_info=True)
|
LOGGER.error(exc, exc_info=True)
|
||||||
keywords = []
|
keywords = []
|
||||||
|
@ -257,6 +263,7 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
LOGGER.info(f"Running for keyword {keypoint} got response {keypoint_is_rev.content()}")
|
LOGGER.info(f"Running for keyword {keypoint} got response {keypoint_is_rev.content()}")
|
||||||
restitutions.append(keypoint_is_rev.content())
|
restitutions.append(keypoint_is_rev.content())
|
||||||
restitutions.append(res.content())
|
restitutions.append(res.content())
|
||||||
|
LOGGER.info(f"Restitutions: {restitutions}")
|
||||||
yes, no, err = tally_responses(restitutions)
|
yes, no, err = tally_responses(restitutions)
|
||||||
total = yes + no + err
|
total = yes + no + err
|
||||||
paragraph_relevance.append(response.content())
|
paragraph_relevance.append(response.content())
|
||||||
|
@ -266,9 +273,9 @@ async def handle_article_url(message: discord.Message, url: str) -> None:
|
||||||
for i, x in enumerate(paragraph_relevance):
|
for i, x in enumerate(paragraph_relevance):
|
||||||
paragraph_relevance[i] = int(x)
|
paragraph_relevance[i] = int(x)
|
||||||
|
|
||||||
await article_repository.set_paragraphs(url, paragraphs, summary, paragraph_relevance, keywords, paragraph_restitutions)
|
await server.article_repository.set_paragraphs(url, paragraphs, summary, paragraph_relevance, keywords, paragraph_restitutions)
|
||||||
|
|
||||||
average_relevance = (sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance) + sum(paragraph_keypoints)) / 2
|
average_relevance = ((sum(int(x) for x in paragraph_relevance) / len(paragraph_relevance)) + (sum(paragraph_keypoints) / len(paragraph_keypoints))) / 2
|
||||||
median_relevance = sorted(int(ref) for ref in paragraph_relevance)[len(paragraph_relevance) // 2]
|
median_relevance = sorted(int(ref) for ref in paragraph_relevance)[len(paragraph_relevance) // 2]
|
||||||
median_relevance2 = sorted(paragraph_keypoints)[len(paragraph_keypoints) // 2]
|
median_relevance2 = sorted(paragraph_keypoints)[len(paragraph_keypoints) // 2]
|
||||||
|
|
||||||
|
@ -340,14 +347,22 @@ async def on_message(message: discord.Message) -> None:
|
||||||
# Launch the processing task without blocking Discord’s event loop
|
# Launch the processing task without blocking Discord’s event loop
|
||||||
asyncio.create_task(handle_article_url(message, url))
|
asyncio.create_task(handle_article_url(message, url))
|
||||||
|
|
||||||
|
def _run_flask_blocking() -> NoReturn: # helper returns never
|
||||||
|
server.app.run(host="0.0.0.0", port=8000, debug=False, use_reloader=False)
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
if DISCORD_TOKEN is None:
|
if DISCORD_TOKEN is None:
|
||||||
raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")
|
raise RuntimeError("Set the DISCORD_TOKEN environment variable or add it to a .env file.")
|
||||||
|
|
||||||
|
thread = threading.Thread(target=_run_flask_blocking, daemon=True, name="flask-api")
|
||||||
|
thread.start()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
bot.run(DISCORD_TOKEN)
|
bot.run(DISCORD_TOKEN)
|
||||||
finally:
|
finally:
|
||||||
asyncio.run(PlaywrightPool.stop())
|
asyncio.run(PlaywrightPool.stop())
|
||||||
|
server.article_repository.close()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
10
news/pool.py
10
news/pool.py
|
@ -11,7 +11,7 @@ from typing import Final, Optional, Union, Protocol, Any, Tuple
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
def process_html(html):
|
def process_html(html):
|
||||||
return trafilatura.extract(html, output_format='txt', include_images=True, include_formatting=True,
|
return trafilatura.extract(html, output_format='markdown', include_images=True, include_formatting=True,
|
||||||
include_tables=True, include_comments=False, favor_recall=True)
|
include_tables=True, include_comments=False, favor_recall=True)
|
||||||
|
|
||||||
LOGGER = logging.getLogger("pool")
|
LOGGER = logging.getLogger("pool")
|
||||||
|
@ -186,7 +186,7 @@ class ArticleRepository:
|
||||||
row = cur.execute(f"SELECT COUNT(*) FROM summaries WHERE article_id = {row[0]}")
|
row = cur.execute(f"SELECT COUNT(*) FROM summaries WHERE article_id = {row[0]}")
|
||||||
|
|
||||||
result = row.fetchone()
|
result = row.fetchone()
|
||||||
if not row.fetchone() or row.fetchone()[0] == 0:
|
if not result or result[0] == 0:
|
||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
@ -209,10 +209,10 @@ class ArticleRepository:
|
||||||
topic_ids = []
|
topic_ids = []
|
||||||
for topic in topics:
|
for topic in topics:
|
||||||
rows = cur.execute(f"""
|
rows = cur.execute(f"""
|
||||||
INSERT INTO topics (article_id, topic_text, type)
|
INSERT INTO topics (article_id, topic_text)
|
||||||
VALUES ({self.cursor_type}, {self.cursor_type}, {self.cursor_type})
|
VALUES ({self.cursor_type}, {self.cursor_type})
|
||||||
RETURNING id;
|
RETURNING id;
|
||||||
""", (article_id, topic, "keypoint"))
|
""", (article_id, topic))
|
||||||
topic_ids.append(rows.fetchone()[0])
|
topic_ids.append(rows.fetchone()[0])
|
||||||
|
|
||||||
for paragraph, summary_rating, gel in zip(paragraphs, summary_ratings, topic_ratings):
|
for paragraph, summary_rating, gel in zip(paragraphs, summary_ratings, topic_ratings):
|
||||||
|
|
|
@ -0,0 +1,37 @@
|
||||||
|
from flask import Flask, request, jsonify, abort
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Import the repository class from the existing code base.
|
||||||
|
# Adjust the relative import path if pool.py lives in a package.
|
||||||
|
from pool import ArticleRepository
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
article_repository = ArticleRepository()
|
||||||
|
|
||||||
|
@app.route("/articles/<article_url>", methods=["GET"])
def get_article(article_url: str):
    """
    Fetch one article by the URL given in the request path.

    Responds with whatever the repository returns, serialized as JSON,
    or 404 if the repository returns ``None``.

    NOTE(review): Flask's default path converter does not match "/" —
    full URLs are better passed through ``/article-by-url?url=...``.
    """
    # Function-scope import so this view does not depend on a module-level
    # ``import asyncio`` being present.
    import asyncio

    # ArticleRepository.get_article is awaited by its other callers, so
    # calling it bare would hand jsonify() an un-run coroutine object
    # (never None, not JSON-serializable). Drive it to completion here.
    # NOTE(review): asyncio.run() creates a fresh event loop per request;
    # confirm the repository holds no resources bound to another loop.
    article = asyncio.run(article_repository.get_article(article_url))
    if article is None:
        abort(404, description="Article not found")
    return jsonify(article)
|
||||||
|
|
||||||
|
@app.route("/article-by-url", methods=["GET"])
def get_article_by_url():
    """
    Fetch one article by the URL supplied as a query parameter:

        GET /article-by-url?url=https://example.com/foo

    Responds with whatever the repository returns, serialized as JSON.
    Aborts with 400 when ``url`` is missing or empty, and with 404 when
    the repository returns ``None``.
    """
    # Function-scope import so this view does not depend on a module-level
    # ``import asyncio`` being present.
    import asyncio

    url = request.args.get("url")
    if not url:
        abort(400, description="`url` query parameter is required")

    # ``await`` is a SyntaxError inside a plain ``def`` and would stop the
    # whole module from importing. Keep the view synchronous and run the
    # repository coroutine to completion instead.
    # NOTE(review): asyncio.run() creates a fresh event loop per request;
    # confirm the repository holds no resources bound to another loop.
    article = asyncio.run(article_repository.get_article(url))
    if article is None:
        abort(404, description="Article not found")
    return jsonify(article)
|
|
@ -9,6 +9,7 @@ in pkgs.mkShell {
|
||||||
python-dotenv
|
python-dotenv
|
||||||
trafilatura
|
trafilatura
|
||||||
playwright
|
playwright
|
||||||
|
flask
|
||||||
]))
|
]))
|
||||||
];
|
];
|
||||||
propagatedBuildInputs = with pkgs; [
|
propagatedBuildInputs = with pkgs; [
|
||||||
|
@ -48,4 +49,4 @@ in pkgs.mkShell {
|
||||||
export PLAYWRIGHT_BROWSERS_PATH=${pkgs.playwright-driver.browsers}
|
export PLAYWRIGHT_BROWSERS_PATH=${pkgs.playwright-driver.browsers}
|
||||||
export PLAYWRIGHT_SKIP_VALIDATE_HOST_REQUIREMENTS=true
|
export PLAYWRIGHT_SKIP_VALIDATE_HOST_REQUIREMENTS=true
|
||||||
'';
|
'';
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue