newsminimalist RSS pipeline: browserless scraper + server + Chinese translation

2026-05-19 01:04:12 +08:00 · 2026-05-19 01:04:12 +08:00 · 2623da6b7e
commit 2623da6b7e
3 changed files with 352 additions and 0 deletions
--- a/README.md
+++ b/README.md
@ -0,0 +1,40 @@
+# NewsMinimalist RSS
+
+从 [newsminimalist.com](https://www.newsminimalist.com) 抓取 Gemini AI 评分的新闻，生成 RSS Feed + HTML 页面。
+
+## 架构
+
+```
+browserless (Chrome headless)
+  ↓ /content API
+scraper.py → 解析 <details> → 提取 [score] title (source) link
+  ↓ JSON 缓存
+server.py → RSS 2.0 + HTML
+  ↓ NPM 反代
+https://rsshub.arabiancloud.online/newsminimalist
+```
+
+## 文件
+
+| 文件 | 说明 |
+|:-----|:-----|
+| `scraper.py` | 爬虫：browserless 渲染 → BeautifulSoup 解析 → Google 翻译中文 → JSON 缓存 |
+| `server.py` | 服务：读 JSON 缓存 → 输出 RSS 2.0（含 Atom self 链接）+ HTML |
+
+## 部署
+
+```bash
+# 1. 拉取浏览器镜像
+docker pull browserless/chrome
+
+# 2. 构建并运行
+docker build -t newsminimalist-rss .
+docker run -d --name newsminimalist-rss -p 1202:1202 \
+  --network rsshub_default \
+  -v /root/news_cache.json:/root/news_cache.json \
+  newsminimalist-rss
+
+# 3. 定时抓取（建议 UTC 02:50, 14:50；cron 按宿主机本地时区执行，若宿主机不是 UTC 请自行换算）
+crontab -e
+50 2,14 * * * docker exec newsminimalist-rss python3 /app/scraper.py
+```
--- a/scraper.py
+++ b/scraper.py
@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""newsminimalist-scraper — minimal working + Chinese translation"""
+import json, re, time, urllib.parse
+from datetime import datetime, timezone
+import requests
+from bs4 import BeautifulSoup
+
+# Path of the JSON cache file this script writes at the end of a run.
+CACHE_FILE = '/root/news_cache.json'
+
+
+def translate(text):
+    """Translate English ``text`` to Simplified Chinese, best effort.
+
+    Calls the ``translate_a/single`` endpoint on ``translate.googleapis.com``
+    with ``sl=en`` and ``tl=zh-CN``. Only the first 500 characters of ``text``
+    are sent.
+
+    Args:
+        text: Source string. A falsy value (``''`` or ``None``) short-circuits.
+
+    Returns:
+        ``''`` for falsy input; the concatenated translated segments on
+        success; otherwise the original ``text`` unchanged (non-200 status,
+        network failure, or a response that is not the expected nested list).
+    """
+    if not text:
+        return ''
+    try:
+        r = requests.get(
+            'https://translate.googleapis.com/translate_a/single',
+            # Let requests build and percent-encode the query string.
+            params={'client': 'gtx', 'sl': 'en', 'tl': 'zh-CN', 'dt': 't', 'q': text[:500]},
+            timeout=5,
+        )
+        if r.status_code == 200:
+            result = r.json()
+            if result and result[0]:
+                return ''.join(p[0] for p in result[0] if p[0])
+    except (requests.RequestException, ValueError, LookupError, TypeError):
+        # Best effort: transport errors, undecodable JSON, or an unexpected
+        # response shape all fall back to the untranslated text. Anything else
+        # (e.g. KeyboardInterrupt) is allowed to propagate.
+        pass
+    return text
+
+
+# === Main scrape logic ===
+# Fetch the front page through the browserless /content API (per the README,
+# browserless renders the page in headless Chrome and returns the HTML).
+r = requests.post('http://browserless:3000/content', json={'url': 'https://www.newsminimalist.com/'}, timeout=60)
+# Fail loudly on an HTTP error instead of parsing an error page as if it were news.
+r.raise_for_status()
+soup = BeautifulSoup(r.text, 'lxml')
+
+articles = []
+for d in soup.find_all('details'):
+    summary = d.find('summary')
+    if not summary:
+        continue
+    # Only <details> whose summary carries a "Significance" score span are articles.
+    score_span = summary.find('span', title=re.compile('Significance'))
+    if not score_span:
+        continue
+    mr = summary.find('div', class_=re.compile('mr-auto'))
+    if not mr:
+        continue
+
+    title = ''
+    source = ''
+    link = ''
+
+    # Direct child spans: an 'inline-block' span holds "(source ...)", the first
+    # other span holds the headline.
+    for s in mr.find_all('span', recursive=False):
+        cls = ' '.join(s.get('class', []))
+        if 'inline-block' in cls:
+            src_text = s.get_text(strip=True).lstrip('(').rstrip(')')
+            # Keep the leading run of characters up to the first whitespace or '+'.
+            m = re.match(r'([^\s+]+)', src_text)
+            if m:
+                source = m.group(1)
+        elif not title:
+            title = s.get_text(strip=True)
+
+    if not title:
+        continue
+
+    # First absolute link that points away from newsminimalist.com.
+    for a in d.find_all('a', href=True):
+        href = a['href']
+        if href.startswith('http') and 'newsminimalist.com' not in href:
+            link = href
+            break
+
+    score_text = score_span.get_text(strip=True)
+    sm = re.search(r'(\d+\.?\d*)', score_text)
+    score = float(sm.group(1)) if sm else 0
+
+    articles.append({
+        'title': title,
+        'link': link or 'https://www.newsminimalist.com',
+        'score': score,
+        'source': source,
+        'summary': '',
+        'title_zh': '',
+    })
+
+# Dedup on the first 80 characters of the title (first occurrence wins), then
+# sort by score, highest first.
+seen = set()
+unique = []
+for a in articles:
+    key = a['title'][:80]
+    if key in seen:
+        continue
+    seen.add(key)
+    unique.append(a)
+unique.sort(key=lambda a: a['score'], reverse=True)
+
+print(f'Scraped {len(unique)} articles')
+
+if not unique:
+    # Nothing parsed (markup change or an empty render): keep the previous cache
+    # instead of overwriting it with an empty list, and exit non-zero for cron.
+    raise SystemExit(f'No articles parsed; {CACHE_FILE} left untouched')
+
+# Chinese translation of every title.
+for a in unique:
+    try:
+        a['title_zh'] = translate(a['title'])
+    except Exception:
+        # Translation is optional; never let it abort the scrape.
+        a['title_zh'] = a['title']
+    time.sleep(0.15)  # short pause between requests to the translation endpoint
+
+# translate() returns its input on failure, so a title only counts as
+# translated when the result actually differs from the original.
+translated = sum(1 for a in unique if a['title_zh'] and a['title_zh'] != a['title'])
+print(f'  +{translated} Chinese translations')
+
+# Sample the clock once so 'date' and 'updated' cannot straddle midnight.
+now = datetime.now(timezone.utc)
+data = {
+    'date': now.strftime('%Y-%m-%d'),
+    'updated': now.isoformat(),
+    'count': len(unique),
+    'news': unique,
+}
+
+with open(CACHE_FILE, 'w', encoding='utf-8') as f:
+    json.dump(data, f, ensure_ascii=False, indent=2)
+
+# Preview the three highest-scored articles.
+for a in unique[:3]:
+    zh = a.get('title_zh', '')
+    print(f'  [{a["score"]}] {a["title"][:70]}')
+    if zh:
+        print(f'       🇨🇳 {zh[:70]}')
--- a/server.py
+++ b/server.py
@ -0,0 +1,209 @@
+#!/usr/bin/env python3
+"""News Minimalist RSS Server — serves RSS/HTML from scraped cache"""
+import html
+import http.server
+import json
+import os
+import socketserver
+import time
+from collections import defaultdict
+from datetime import datetime, timezone
+from email.utils import format_datetime
+
+# TCP port the HTTP server binds to.
+PORT = 1202
+# Path of the JSON cache file that load_cache() reads.
+CACHE_FILE = '/root/news_cache.json'
+
+
+def load_cache():
+    """Load the JSON cache from ``CACHE_FILE``.
+
+    Returns:
+        The parsed cache dict, or the empty placeholder
+        ``{'news': [], 'date': 'unknown', 'count': 0}`` when the file is
+        missing, unreadable, not valid JSON, or does not hold a JSON object.
+    """
+    empty = {'news': [], 'date': 'unknown', 'count': 0}
+    try:
+        with open(CACHE_FILE, 'r', encoding='utf-8') as f:
+            data = json.load(f)
+    except (OSError, ValueError):
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
+        return empty
+    # Callers use dict methods (.get) on the result, so reject other JSON types.
+    return data if isinstance(data, dict) else empty
+
+
+def generate_rss(data):
+    """Generate RSS 2.0 XML with an Atom self-link.
+
+    Args:
+        data: Cache dict. Reads ``news`` (list of article dicts), ``updated``
+            (ISO 8601 timestamp string) and the optional ``category`` /
+            ``score_range`` labels. Each article may carry ``title``,
+            ``title_zh``, ``link``, ``score``, ``source`` and ``summary``.
+
+    Returns:
+        The feed as a ``str``; at most the first 50 articles are included.
+        Every cached value is escaped before interpolation, because the text
+        comes from scraped pages and a bare ``&`` or ``<`` would otherwise
+        make the document ill-formed.
+
+    NOTE(review): the ``atom:link`` self href is built from ``BASE_URL``
+    rather than this server's public address; confirm that is intended.
+    """
+    def esc(value):
+        # html.escape emits only &amp; &lt; &gt; &quot; &#x27;, all valid in XML.
+        return html.escape(str(value))
+
+    news = data.get('news', [])
+    cat = data.get('category', 'all')
+    score_range = esc(data.get('score_range', '0-10'))
+    cat_label = 'All Categories' if cat == 'all' else esc(str(cat).title())
+
+    # RSS 2.0 dates are RFC 822; the cache stores ISO 8601. Omit the element
+    # when the timestamp is missing or malformed.
+    try:
+        build_date = format_datetime(datetime.fromisoformat(data.get('updated', '')))
+    except (TypeError, ValueError):
+        build_date = ''
+    last_build = f'    <lastBuildDate>{build_date}</lastBuildDate>\n' if build_date else ''
+
+    items = []
+    for n in news[:50]:
+        title = esc(n.get('title') or '')
+        title_zh = esc(n.get('title_zh') or '')
+        link = esc(n.get('link') or BASE_URL)
+        score = n.get('score')
+        source = esc(n.get('source') or '')
+        summary = esc(n.get('summary') or '')
+        score_text = esc(score) if score is not None else ''
+
+        prefix = f'[{score_text}] ' if score is not None else ''
+        # The description is HTML inside CDATA. Its values are already escaped,
+        # so they cannot contain the ']]>' terminator.
+        desc = f'<p><strong>Significance:</strong> {score_text}/10</p>' if score is not None else ''
+        if title_zh:
+            desc += f'<p>🇨🇳 <strong>中文:</strong> {title_zh}</p>'
+        if summary:
+            desc += f'<p><strong>AI Analysis:</strong> {summary}</p>'
+        if source:
+            desc += f'<p><small>Source: {source}</small></p>'
+
+        items.append(f'''    <item>
+      <title>{prefix}{title}</title>
+      <link>{link}</link>
+      <guid isPermaLink="true">{link}</guid>
+      <description><![CDATA[{desc}]]></description>
+      <author>{source or 'News Minimalist'}</author>
+    </item>
+''')
+    items_xml = ''.join(items)
+
+    return f'''<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0" xmlns:atom="http://www.w3.org/2005/Atom">
+  <channel>
+    <title>News Minimalist — {cat_label} [{score_range}]</title>
+    <link>{BASE_URL}</link>
+    <description>AI-curated significant news. Category: {cat_label}, Score: {score_range}. Scored 0-10 by Gemini.</description>
+    <language>en</language>
+{last_build}    <atom:link href="{BASE_URL}/rss" rel="self" type="application/rss+xml"/>
+{items_xml}  </channel>
+</rss>'''
+
+
+def generate_html(data):
+    """Generate the HTML landing page listing the cached articles.
+
+    Args:
+        data: Cache dict. Reads ``news`` (list of article dicts), ``date``,
+            and the optional ``category`` / ``score_range`` labels.
+
+    Returns:
+        A complete HTML document as a ``str``. Articles are grouped into three
+        tiers by score (>= 6.5, 6.0 to < 6.5, everything else including a
+        score of 0 or a missing score). A short placeholder page is returned
+        when there are no articles. Every cached value is HTML-escaped because
+        it comes from scraped pages.
+    """
+    def esc(value):
+        return html.escape(str(value))
+
+    news = data.get('news', [])
+    cache_date = esc(data.get('date', 'unknown'))
+    cat = esc(data.get('category', 'all'))
+    score_range = esc(data.get('score_range', '0-10'))
+
+    if not news:
+        return '''<!DOCTYPE html><html><body style="text-align:center;padding:100px;font-family:sans-serif">
+            <h1>📭 No articles cached yet</h1>
+            <p>Cache will be populated on next scrape cycle.</p>
+        </body></html>'''
+
+    # Group by score tiers. Compare against None rather than truthiness so an
+    # article scored 0 still lands in the last tier instead of disappearing.
+    scored = [n for n in news if n.get('score') is not None]
+    unscored = [n for n in news if n.get('score') is None]
+    hot = [n for n in scored if n['score'] >= 6.5]
+    notable = [n for n in scored if 6.0 <= n['score'] < 6.5]
+    rest = [n for n in scored if n['score'] < 6.0] + unscored
+
+    def render_items(items, color, badge):
+        """Render one tier of articles as a run of ``.item`` cards."""
+        cards = []
+        for n in items:
+            score = n.get('score')
+            title = esc(n.get('title') or '')
+            title_zh = esc(n.get('title_zh') or '')
+            link = esc(n.get('link') or '')
+            source = esc(n.get('source') or '')
+            summary = esc(n.get('summary') or '')
+            score_text = esc(score) if score is not None else '–'
+
+            display_title = title_zh or title
+            # Show the English original underneath only when a distinct
+            # translation exists; a failed translation echoes the title.
+            subtitle = title if title_zh and title_zh != title else ''
+            subtitle_html = f'<div class="title-en">{subtitle}</div>' if subtitle else ''
+            summary_html = f'<p class="summary">{summary}</p>' if summary else ''
+
+            cards.append(f'''<div class="item">
+                <div class="score" style="background:{color}">{badge} {score_text}</div>
+                <div class="content">
+                    <a class="title" href="{link}" target="_blank" rel="noopener noreferrer">{display_title}</a>
+                    {subtitle_html}
+                    {summary_html}
+                    <span class="source">{source or 'newsminimalist.com'}</span>
+                </div>
+            </div>''')
+        return ''.join(cards)
+
+    body = ''
+    if hot:
+        body += '<h2>🔥 Trending (6.5+)</h2>' + render_items(hot, '#ef4444', '🔥')
+    if notable:
+        body += '<h2>⭐ Notable (6.0-6.4)</h2>' + render_items(notable, '#3b82f6', '⭐')
+    if rest:
+        body += '<h2>📰 All Articles</h2>' + render_items(rest, '#22c55e', '📰')
+
+    return f'''<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>News Minimalist — RSS Feed</title>
+    <style>
+        * {{ margin: 0; padding: 0; box-sizing: border-box; }}
+        body {{ font-family: -apple-system, BlinkMacSystemFont, sans-serif; background: #f8fafc; color: #1e293b; }}
+        .header {{ background: linear-gradient(135deg, #1e293b, #334155); color: white; padding: 32px 20px; text-align: center; }}
+        .header h1 {{ font-size: 2rem; }}
+        .header .meta {{ margin-top: 12px; opacity: 0.8; font-size: 0.9rem; }}
+        .header .rss-btn {{ display: inline-block; margin-top: 12px; background: #f97316; color: white; padding: 8px 20px; border-radius: 20px; text-decoration: none; font-weight: 600; }}
+        .container {{ max-width: 800px; margin: 0 auto; padding: 24px 16px; }}
+        h2 {{ font-size: 1.3rem; margin: 24px 0 12px 0; color: #475569; }}
+        .item {{ display: flex; gap: 16px; background: white; border-radius: 12px; padding: 16px; margin-bottom: 12px; box-shadow: 0 1px 3px rgba(0,0,0,0.08); }}
+        .score {{ min-width: 60px; height: 60px; display: flex; align-items: center; justify-content: center; border-radius: 10px; color: white; font-weight: bold; font-size: 0.95rem; flex-shrink: 0; }}
+        .content {{ flex: 1; }}
+        .title {{ font-size: 1.05rem; color: #1e293b; text-decoration: none; line-height: 1.5; }}
+        .title:hover {{ color: #3b82f6; }}
+        .title-en {{ font-size: 0.8rem; color: #94a3b8; margin-top: 4px; }}
+        .summary {{ color: #64748b; font-size: 0.85rem; margin-top: 6px; line-height: 1.5; }}
+        .source {{ color: #94a3b8; font-size: 0.75rem; margin-top: 8px; display: block; }}
+    </style>
+</head>
+<body>
+    <div class="header">
+        <h1>🤖 News Minimalist</h1>
+        <div class="meta">AI-curated news · {cat} · Score {score_range} · Cache: {cache_date}</div>
+        <div class="meta">{len(news)} articles</div>
+        <a href="/rss" class="rss-btn">📡 RSS Feed</a>
+    </div>
+    <div class="container">
+        {body}
+    </div>
+</body>
+</html>'''
+
+
+# Upstream site URL. generate_rss() uses it as the channel <link>, as the base
+# of the atom:link self href, and as the fallback link for items without one.
+BASE_URL = 'https://www.newsminimalist.com'
+
+
+class Handler(http.server.BaseHTTPRequestHandler):
+    """Serve the health probe, the RSS feed and the HTML page from the cache.
+
+    Derives from ``BaseHTTPRequestHandler`` rather than
+    ``SimpleHTTPRequestHandler`` so that methods this class does not implement
+    (e.g. ``HEAD``) are rejected instead of being answered by the inherited
+    static-file handler from the process working directory.
+
+    Routes (the query string is ignored when matching):
+        ``/health``, ``/ping``          JSON status with cache age and count
+        ``/rss...``, ``/feed``          RSS 2.0 feed
+        ``/``, ``/home``, ``/index.html``  HTML page
+        anything else                   404
+    """
+
+    def _send(self, status, content_type, body, extra_headers=()):
+        """Write a complete response for ``body`` (bytes) with Content-Length."""
+        self.send_response(status)
+        self.send_header('Content-Type', content_type)
+        self.send_header('Content-Length', str(len(body)))
+        for name, value in extra_headers:
+            self.send_header(name, value)
+        self.end_headers()
+        self.wfile.write(body)
+
+    def do_GET(self):
+        """Dispatch a GET request to the matching route."""
+        # Match on the path only, so '/?utm=x' or '/feed?format=xml' still route.
+        path = self.path.split('?', 1)[0]
+
+        if path in ('/health', '/ping'):
+            data = load_cache()
+            try:
+                age = time.time() - os.path.getmtime(CACHE_FILE)
+            except OSError:
+                age = 99999  # sentinel age reported when the cache file is missing
+            payload = json.dumps({
+                'status': 'ok',
+                'cache_age_hours': round(age / 3600, 1),
+                'article_count': data.get('count', 0),
+                'date': data.get('date', 'unknown'),
+            }).encode()
+            self._send(200, 'application/json', payload)
+            return
+
+        if path.startswith('/rss') or path == '/feed':
+            rss = generate_rss(load_cache())
+            self._send(200, 'application/rss+xml; charset=utf-8', rss.encode('utf-8'), (
+                ('Cache-Control', 'public, max-age=14400'),
+                ('Access-Control-Allow-Origin', '*'),
+            ))
+            return
+
+        if path in ('/', '/home', '/index.html'):
+            page = generate_html(load_cache())
+            self._send(200, 'text/html; charset=utf-8', page.encode('utf-8'))
+            return
+
+        self._send(404, 'text/plain; charset=utf-8', b'Not Found')
+
+
+if __name__ == '__main__':
+    print(f'News Minimalist RSS on :{PORT} — scraping newsminimalist.com')
+    # ThreadingHTTPServer enables allow_reuse_address, so a quick restart does
+    # not fail with "Address already in use", and it handles each request on
+    # its own thread so one slow client cannot stall the health probe.
+    with http.server.ThreadingHTTPServer(('', PORT), Handler) as httpd:
+        httpd.serve_forever()