newsminimalist RSS pipeline: browserless scraper + server + Chinese translation
This commit is contained in:
commit
2623da6b7e
40
README.md
Normal file
40
README.md
Normal file
@ -0,0 +1,40 @@
|
||||
# NewsMinimalist RSS
|
||||
|
||||
从 [newsminimalist.com](https://www.newsminimalist.com) 抓取 Gemini AI 评分的新闻,生成 RSS Feed + HTML 页面。
|
||||
|
||||
## 架构
|
||||
|
||||
```
|
||||
browserless (Chrome headless)
|
||||
↓ /content API
|
||||
scraper.py → 解析 <details> → 提取 [score] title (source) link
|
||||
↓ JSON 缓存
|
||||
server.py → RSS 2.0 + HTML
|
||||
↓ NPM 反代
|
||||
https://rsshub.arabiancloud.online/newsminimalist
|
||||
```
|
||||
|
||||
## 文件
|
||||
|
||||
| 文件 | 说明 |
|
||||
|:-----|:-----|
|
||||
| `scraper.py` | 爬虫:browserless 渲染 → BeautifulSoup 解析 → Google 翻译中文 → JSON 缓存 |
|
||||
| `server.py` | 服务:读 JSON 缓存 → 输出 RSS 2.0(含 Atom self 链接)+ HTML |
|
||||
|
||||
## 部署
|
||||
|
||||
```bash
|
||||
# 1. 拉取浏览器镜像
|
||||
docker pull browserless/chrome
|
||||
|
||||
# 2. 构建并运行
|
||||
docker build -t newsminimalist-rss .
|
||||
docker run -d --name newsminimalist-rss -p 1202:1202 \
|
||||
--network rsshub_default \
|
||||
-v /root/news_cache.json:/root/news_cache.json \
|
||||
newsminimalist-rss
|
||||
|
||||
# 3. 定时抓取(建议 UTC 02:50, 14:50)
|
||||
crontab -e
|
||||
50 2,14 * * * docker exec newsminimalist-rss python3 /app/scraper.py
|
||||
```
|
||||
103
scraper.py
Normal file
103
scraper.py
Normal file
@ -0,0 +1,103 @@
|
||||
#!/usr/bin/env python3
|
||||
"""newsminimalist-scraper — minimal working + Chinese translation"""
|
||||
import json, re, time, urllib.parse
|
||||
from datetime import datetime, timezone
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Path of the JSON cache file this scraper writes its results to.
CACHE_FILE = '/root/news_cache.json'
|
||||
|
||||
|
||||
def translate(text):
    """Translate English *text* to Simplified Chinese, best effort.

    Sends at most the first 500 characters of *text* to the
    ``translate.googleapis.com/translate_a/single`` endpoint and joins the
    translated segments found in the first element of the JSON reply.

    Args:
        text: Text to translate; may be empty or ``None``.

    Returns:
        ``''`` when *text* is falsy. The joined translation when the request
        succeeds with status 200 and a non-empty first element. Otherwise
        the original *text*, unchanged.
    """
    if not text:
        return ''
    try:
        url = (
            'https://translate.googleapis.com/translate_a/single'
            '?client=gtx&sl=en&tl=zh-CN&dt=t&q='
            + urllib.parse.quote(text[:500])
        )
        r = requests.get(url, timeout=5)
        if r.status_code == 200:
            result = r.json()
            if result and result[0]:
                return ''.join(p[0] for p in result[0] if p[0])
    except (requests.RequestException, ValueError, TypeError, LookupError):
        # Network failure, non-JSON body, or a reply/argument of unexpected
        # shape: degrade to the untranslated text. Unlike a bare ``except``,
        # this lets KeyboardInterrupt/SystemExit propagate.
        pass
    return text
|
||||
|
||||
|
||||
# === Main scrape logic ===
# Fetch the rendered page HTML from the browserless /content endpoint.
r = requests.post(
    'http://browserless:3000/content',
    json={'url': 'https://www.newsminimalist.com/'},
    timeout=60,
)
# Fail loudly on an HTTP error instead of parsing an error page into an empty
# article list.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

articles = []
for d in soup.find_all('details'):
    summary = d.find('summary')
    if not summary:
        continue
    score_span = summary.find('span', title=re.compile('Significance'))
    if not score_span:
        continue
    mr = summary.find('div', class_=re.compile('mr-auto'))
    if not mr:
        continue

    title = ''
    source = ''
    link = ''

    for s in mr.find_all('span', recursive=False):
        cls = ' '.join(s.get('class', []))
        if 'inline-block' in cls:
            # Source span: strip the surrounding parentheses, then keep the
            # leading run of characters up to the first whitespace or '+'.
            src_text = s.get_text(strip=True).lstrip('(').rstrip(')')
            m = re.match(r'([^\s+]+)', src_text)
            if m:
                source = m.group(1)
        elif not title:
            # The first span that is not a source span carries the headline.
            title = s.get_text(strip=True)

    if not title:
        continue

    # First absolute link inside <details> that leaves newsminimalist.com.
    for a in d.find_all('a', href=True):
        href = a['href']
        if href.startswith('http') and 'newsminimalist.com' not in href:
            link = href
            break

    sm = re.search(r'(\d+\.?\d*)', score_span.get_text(strip=True))
    score = float(sm.group(1)) if sm else 0

    articles.append({
        'title': title,
        'link': link or 'https://www.newsminimalist.com',
        'score': score,
        'source': source,
        'summary': '',
        'title_zh': '',
    })

# Dedup on the first 80 characters of the title (first occurrence wins),
# then sort by score, highest first.
seen = set()
unique = []
for a in articles:
    key = a['title'][:80]
    if key in seen:
        continue
    seen.add(key)
    unique.append(a)
unique.sort(key=lambda a: a['score'], reverse=True)

print(f'Scraped {len(unique)} articles')

if not unique:
    # Nothing matched the expected markup; keep the previous cache rather
    # than replacing it with an empty feed.
    raise SystemExit('No articles parsed; leaving existing cache untouched')

# Chinese translation of every headline. translate() returns the original
# text when it cannot translate, so title_zh is never left empty.
for a in unique:
    try:
        a['title_zh'] = translate(a['title'])
    except Exception:
        # Best effort: one bad headline must not abort the whole run.
        a['title_zh'] = a['title']
    time.sleep(0.15)  # pause between calls to the translation endpoint

# Count only headlines whose text actually changed; a plain non-empty check
# would also count the untranslated fallbacks.
translated = sum(1 for a in unique if a['title_zh'] != a['title'])
print(f' +{translated} Chinese translations')

# Take a single timestamp so 'date' and 'updated' can never disagree.
now = datetime.now(timezone.utc)
data = {
    'date': now.strftime('%Y-%m-%d'),
    'updated': now.isoformat(),
    'count': len(unique),
    'news': unique,
}

with open(CACHE_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Preview the three highest-scored articles.
for a in unique[:3]:
    zh = a.get('title_zh', '')
    print(f' [{a["score"]}] {a["title"][:70]}')
    if zh:
        print(f' 🇨🇳 {zh[:70]}')
|
||||
209
server.py
Normal file
209
server.py
Normal file
@ -0,0 +1,209 @@
|
||||
#!/usr/bin/env python3
|
||||
"""News Minimalist RSS Server — serves RSS/HTML from scraped cache"""
|
||||
import html
import http.server
import json
import os
import socketserver
import time
from collections import defaultdict
from datetime import datetime, timezone
from email.utils import format_datetime
from urllib.parse import urlsplit
|
||||
|
||||
# TCP port the HTTP server listens on.
PORT = 1202
|
||||
# JSON cache file read by load_cache() and stat'ed by the /health endpoint.
CACHE_FILE = '/root/news_cache.json'
|
||||
|
||||
|
||||
def load_cache(path=None):
    """Load the scraped-news cache, falling back to an empty payload.

    Args:
        path: JSON file to read. Defaults to the module-level ``CACHE_FILE``.

    Returns:
        The decoded JSON object when the file can be read and holds a JSON
        object; otherwise ``{'news': [], 'date': 'unknown', 'count': 0}``.
    """
    empty = {'news': [], 'date': 'unknown', 'count': 0}
    if path is None:
        path = CACHE_FILE
    try:
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing or unreadable file, undecodable bytes, or malformed JSON.
        return empty
    # Callers use .get() on the result, so any other top-level JSON type
    # (list, string, null, ...) is treated like a missing cache.
    return data if isinstance(data, dict) else empty
|
||||
|
||||
|
||||
def generate_rss(data, base_url=None):
    """Generate an RSS 2.0 document (with an Atom self-link) from cache data.

    Args:
        data: Cache payload. Reads ``news`` (a list of article dicts with
            ``title``, ``title_zh``, ``link``, ``score``, ``source`` and
            ``summary``), ``category``, ``score_range`` and ``updated``.
        base_url: URL used for the channel link, the ``atom:link`` self href
            (``<base_url>/rss``) and as the link of articles that have none.
            Defaults to the module-level ``BASE_URL``.

    Returns:
        The feed as a ``str``. At most the first 50 articles are included.
    """
    if base_url is None:
        base_url = BASE_URL

    def esc(value):
        """Escape *value* for XML text/attributes and for the HTML body."""
        return html.escape(str(value), quote=True)

    cat = data.get('category', 'all')
    score_range = data.get('score_range', '0-10')
    cat_label = 'All Categories' if cat == 'all' else str(cat).title()

    items = []
    for n in (data.get('news') or [])[:50]:
        title = n.get('title', '')
        title_zh = n.get('title_zh', '')
        # ``or`` also covers a stored empty string, not just a missing key.
        link = n.get('link') or base_url
        score = n.get('score')
        source = n.get('source', '')
        summary = n.get('summary', '')

        prefix = f'[{score}] ' if score is not None else ''

        # The description is an HTML fragment wrapped in CDATA. Every
        # interpolated value is HTML-escaped, which also guarantees the
        # fragment can never contain the CDATA terminator "]]>".
        desc = ''
        if score is not None:
            desc += f'<p><strong>Significance:</strong> {esc(score)}/10</p>'
        if title_zh:
            desc += f'<p>🇨🇳 <strong>中文:</strong> {esc(title_zh)}</p>'
        if summary:
            desc += f'<p><strong>AI Analysis:</strong> {esc(summary)}</p>'
        if source:
            desc += f'<p><small>Source: {esc(source)}</small></p>'

        if str(link).rstrip('/') == str(base_url).rstrip('/'):
            # Articles without their own URL all share the site link; give
            # each a distinct, non-permalink guid so they are not identical.
            guid_value = f'{base_url}#{title}'
            guid = f'<guid isPermaLink="false">{esc(guid_value)}</guid>'
        else:
            guid = f'<guid isPermaLink="true">{esc(link)}</guid>'

        title_text = f'{prefix}{title}'
        items.append(
            '    <item>\n'
            f'      <title>{esc(title_text)}</title>\n'
            f'      <link>{esc(link)}</link>\n'
            f'      {guid}\n'
            f'      <description><![CDATA[{desc}]]></description>\n'
            f"      <author>{esc(source or 'News Minimalist')}</author>\n"
            '    </item>\n'
        )

    # RSS 2.0 dates are RFC 822; the cache stores an ISO-8601 timestamp.
    try:
        build_date = format_datetime(datetime.fromisoformat(data.get('updated', '')))
        last_build = f'    <lastBuildDate>{build_date}</lastBuildDate>\n'
    except (TypeError, ValueError):
        # Missing or unparseable timestamp: omit the optional element.
        last_build = ''

    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">\n'
        '  <channel>\n'
        f'    <title>News Minimalist — {esc(cat_label)} [{esc(score_range)}]</title>\n'
        f'    <link>{esc(base_url)}</link>\n'
        f'    <description>AI-curated significant news. Category: {esc(cat_label)}, '
        f'Score: {esc(score_range)}. Scored 0-10 by Gemini.</description>\n'
        '    <language>en</language>\n'
        f'{last_build}'
        f'    <atom:link href="{esc(base_url)}/rss" rel="self" type="application/rss+xml"/>\n'
        f"{''.join(items)}"
        '  </channel>\n'
        '</rss>'
    )
|
||||
|
||||
|
||||
def generate_html(data):
    """Render the cached articles as a standalone HTML page.

    Articles are grouped into three sections by score: 6.5 and above,
    6.0 up to (not including) 6.5, and everything else, where articles whose
    score is ``None`` come last. All values taken from the cache are
    HTML-escaped before being inserted into the page.

    Args:
        data: Cache payload. Reads ``news`` (a list of article dicts with
            ``title``, ``title_zh``, ``link``, ``score``, ``source`` and
            ``summary``), ``date``, ``category`` and ``score_range``.

    Returns:
        The page as a ``str``; a short placeholder page when ``news`` is
        empty or missing.
    """
    news = data.get('news') or []
    cache_date = data.get('date', 'unknown')
    cat = data.get('category', 'all')
    score_range = data.get('score_range', '0-10')

    if not news:
        return '''<!DOCTYPE html><html><body style="text-align:center;padding:100px;font-family:sans-serif">
<h1>📭 No articles cached yet</h1>
<p>Cache will be populated on next scrape cycle.</p>
</body></html>'''

    def esc(value):
        """Escape *value* for HTML text and double-quoted attributes."""
        return html.escape(str(value), quote=True)

    # Group by score tiers. Test against None rather than truthiness so that
    # a score of 0 still lands in a tier instead of vanishing from the page.
    scored = [n for n in news if n.get('score') is not None]
    unscored = [n for n in news if n.get('score') is None]
    hot = [n for n in scored if n['score'] >= 6.5]
    notable = [n for n in scored if 6.0 <= n['score'] < 6.5]
    rest = [n for n in scored if n['score'] < 6.0] + unscored

    def render_items(items, color, badge):
        """Render one tier as a run of ``<div class="item">`` cards."""
        cards = []
        for n in items:
            score = n.get('score')
            title = n.get('title', '')
            title_zh = n.get('title_zh', '')
            link = n.get('link', '')
            source = n.get('source', '')
            summary = n.get('summary', '')

            # Prefer the Chinese headline; show the English one underneath.
            display_title = title_zh or title
            badge_text = f'{badge} {esc(score)}' if score is not None else badge
            title_en = f'<div class="title-en">{esc(title)}</div>' if title_zh else ''
            summary_html = f'<p class="summary">{esc(summary)}</p>' if summary else ''

            cards.append(f'''<div class="item">
  <div class="score" style="background:{color}">{badge_text}</div>
  <div class="content">
    <a class="title" href="{esc(link)}" target="_blank" rel="noopener noreferrer">{esc(display_title)}</a>
    {title_en}
    {summary_html}
    <span class="source">{esc(source or 'newsminimalist.com')}</span>
  </div>
</div>''')
        return ''.join(cards)

    body = ''
    if hot:
        body += '<h2>🔥 Trending (6.5+)</h2>' + render_items(hot, '#ef4444', '🔥')
    if notable:
        body += '<h2>⭐ Notable (6.0-6.4)</h2>' + render_items(notable, '#3b82f6', '⭐')
    if rest:
        body += '<h2>📰 All Articles</h2>' + render_items(rest, '#22c55e', '📰')

    # Kept outside the f-string below so the CSS braces need no doubling.
    css = '''* { margin: 0; padding: 0; box-sizing: border-box; }
body { font-family: -apple-system, BlinkMacSystemFont, sans-serif; background: #f8fafc; color: #1e293b; }
.header { background: linear-gradient(135deg, #1e293b, #334155); color: white; padding: 32px 20px; text-align: center; }
.header h1 { font-size: 2rem; }
.header .meta { margin-top: 12px; opacity: 0.8; font-size: 0.9rem; }
.header .rss-btn { display: inline-block; margin-top: 12px; background: #f97316; color: white; padding: 8px 20px; border-radius: 20px; text-decoration: none; font-weight: 600; }
.container { max-width: 800px; margin: 0 auto; padding: 24px 16px; }
h2 { font-size: 1.3rem; margin: 24px 0 12px 0; color: #475569; }
.item { display: flex; gap: 16px; background: white; border-radius: 12px; padding: 16px; margin-bottom: 12px; box-shadow: 0 1px 3px rgba(0,0,0,0.08); }
.score { min-width: 60px; height: 60px; display: flex; align-items: center; justify-content: center; border-radius: 10px; color: white; font-weight: bold; font-size: 0.95rem; flex-shrink: 0; }
.content { flex: 1; }
.title { font-size: 1.05rem; color: #1e293b; text-decoration: none; line-height: 1.5; }
.title:hover { color: #3b82f6; }
.title-en { font-size: 0.8rem; color: #94a3b8; margin-top: 4px; }
.summary { color: #64748b; font-size: 0.85rem; margin-top: 6px; line-height: 1.5; }
.source { color: #94a3b8; font-size: 0.75rem; margin-top: 8px; display: block; }'''

    return f'''<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>News Minimalist — RSS Feed</title>
<style>
{css}
</style>
</head>
<body>
<div class="header">
  <h1>🤖 News Minimalist</h1>
  <div class="meta">AI-curated news · {esc(cat)} · Score {esc(score_range)} · Cache: {esc(cache_date)}</div>
  <div class="meta">{len(news)} articles</div>
  <a href="/rss" class="rss-btn">📡 RSS Feed</a>
</div>
<div class="container">
{body}
</div>
</body>
</html>'''
|
||||
|
||||
|
||||
# Upstream site URL: used by generate_rss() for the channel link, the
# atom:link self href and as the default item link. It is defined after the
# generator functions, which works because they look it up at call time.
BASE_URL = 'https://www.newsminimalist.com'
|
||||
|
||||
|
||||
class Handler(http.server.SimpleHTTPRequestHandler):
    """HTTP handler that serves the cached feed.

    GET routes:
        /health, /ping          JSON status (cache age, article count, date)
        /rss..., /feed          RSS 2.0 feed
        /, /home, /index.html   HTML page
        anything else           404

    NOTE(review): only do_GET is overridden, so other methods such as HEAD
    still fall through to SimpleHTTPRequestHandler — confirm that is intended.
    """

    def _send(self, status, content_type, body, extra_headers=()):
        """Write a complete response with an explicit Content-Length.

        Args:
            status: HTTP status code.
            content_type: Value of the Content-Type header.
            body: Response payload as ``bytes``.
            extra_headers: Iterable of ``(name, value)`` header pairs.
        """
        self.send_response(status)
        self.send_header('Content-Type', content_type)
        self.send_header('Content-Length', str(len(body)))
        for name, value in extra_headers:
            self.send_header(name, value)
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Route a GET request by its path and write the response."""
        # Route on the path component only, so a query string such as
        # "/?utm=x" or "/feed?format=rss" no longer turns a hit into a 404.
        path = urlsplit(self.path).path

        if path in ('/health', '/ping'):
            data = load_cache()
            try:
                age = time.time() - os.path.getmtime(CACHE_FILE)
            except OSError:
                # Cache file missing or unreadable: report a very old cache.
                age = 99999
            payload = json.dumps({
                'status': 'ok',
                'cache_age_hours': round(age / 3600, 1),
                'article_count': data.get('count', 0),
                'date': data.get('date', 'unknown'),
            }).encode()
            self._send(200, 'application/json', payload)
            return

        if path.startswith('/rss') or path == '/feed':
            rss = generate_rss(load_cache())
            self._send(
                200,
                'application/rss+xml; charset=utf-8',
                rss.encode('utf-8'),
                extra_headers=(
                    ('Cache-Control', 'public, max-age=14400'),
                    ('Access-Control-Allow-Origin', '*'),
                ),
            )
            return

        if path in ('/', '/home', '/index.html'):
            page = generate_html(load_cache())
            self._send(200, 'text/html; charset=utf-8', page.encode('utf-8'))
            return

        self._send(404, 'text/plain; charset=utf-8', b'Not Found')
|
||||
|
||||
|
||||
class _FeedServer(socketserver.ThreadingMixIn, socketserver.TCPServer):
    """TCP server that handles each request in its own thread."""

    # Allow rebinding the port right after a restart instead of failing with
    # "Address already in use" while old connections linger.
    allow_reuse_address = True
    # Do not let in-flight request threads keep the process alive on exit.
    daemon_threads = True


if __name__ == '__main__':
    print(f'News Minimalist RSS on :{PORT} — scraping newsminimalist.com')
    # Bind on all interfaces and serve until the process is stopped.
    with _FeedServer(('', PORT), Handler) as httpd:
        httpd.serve_forever()
|
||||
Loading…
x
Reference in New Issue
Block a user