104 lines
3.0 KiB
Python

#!/usr/bin/env python3
"""newsminimalist-scraper — minimal working + Chinese translation"""
import json, re, time, urllib.parse
from datetime import datetime, timezone
import requests
from bs4 import BeautifulSoup
CACHE_FILE = '/root/news_cache.json'
def translate(text):
    """Translate English *text* to Simplified Chinese on a best-effort basis.

    Only the first 500 characters of *text* are sent to the Google Translate
    ``translate_a/single`` endpoint (``sl=en``, ``tl=zh-CN``).  The translated
    string is assembled by joining the first element of every entry in the
    first element of the JSON response.

    Args:
        text: Source text.  Empty strings and ``None`` are accepted.

    Returns:
        The translated text; ``''`` when *text* is empty; or *text* itself,
        unchanged, when the request fails, the status is not 200, or the
        response contains no usable translation.
    """
    if not text:
        return ''
    try:
        # Let requests build and percent-encode the query string.
        r = requests.get(
            'https://translate.googleapis.com/translate_a/single',
            params={
                'client': 'gtx',
                'sl': 'en',
                'tl': 'zh-CN',
                'dt': 't',
                'q': text[:500],
            },
            timeout=5,
        )
        if r.status_code == 200:
            result = r.json()
            if result and result[0]:
                translated = ''.join(p[0] for p in result[0] if p[0])
                if translated:
                    return translated
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, non-JSON body, or an unexpected payload shape:
        # translation is optional, so fall through to the untranslated text.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        pass
    return text
# === Main scrape logic (verified working) ===
# Fetch the page through the browserless service; the response body is parsed
# as HTML below.
r = requests.post(
    'http://browserless:3000/content',
    json={'url': 'https://www.newsminimalist.com/'},
    timeout=60,
)
# Abort on an HTTP error instead of parsing an error page into zero articles
# (which would later overwrite the cache with an empty list).
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

# Patterns are loop-invariant, so compile them once.
_SIGNIFICANCE_RE = re.compile('Significance')
_MR_AUTO_RE = re.compile('mr-auto')
_SOURCE_RE = re.compile(r'([^\s+]+)')
_SCORE_RE = re.compile(r'(\d+\.?\d*)')

articles = []
# Each <details> element is treated as one candidate article.
for d in soup.find_all('details'):
    summary = d.find('summary')
    if not summary:
        continue
    # The score lives in a <span> whose title attribute mentions "Significance".
    score_span = summary.find('span', title=_SIGNIFICANCE_RE)
    if not score_span:
        continue
    # Title and source are direct <span> children of the "mr-auto" <div>.
    mr = summary.find('div', class_=_MR_AUTO_RE)
    if not mr:
        continue

    title = ''
    source = ''
    link = ''
    for s in mr.find_all('span', recursive=False):
        cls = ' '.join(s.get('class', []))
        if 'inline-block' in cls:
            # Source span: drop surrounding parentheses, then keep the leading
            # run of characters up to the first whitespace or '+'.
            # NOTE(review): presumably this trims a "+N" extra-sources
            # counter -- confirm against the live markup.
            src_text = s.get_text(strip=True).lstrip('(').rstrip(')')
            m = _SOURCE_RE.match(src_text)
            if m:
                source = m.group(1)
        elif not title:
            # First non-empty span that is not a source span is the title.
            title = s.get_text(strip=True)
    if not title:
        continue

    # First absolute link that does not point back to newsminimalist.com.
    for a in d.find_all('a', href=True):
        href = a['href']
        if href.startswith('http') and 'newsminimalist.com' not in href:
            link = href
            break

    # First number in the score span's text; 0.0 when none is found so that
    # every score is a float.
    sm = _SCORE_RE.search(score_span.get_text(strip=True))
    score = float(sm.group(1)) if sm else 0.0

    articles.append({
        'title': title,
        # Fall back to the site itself when no external link was found.
        'link': link or 'https://www.newsminimalist.com',
        'score': score,
        'source': source,
        'summary': '',
        'title_zh': '',
    })
# Dedup & sort
# Two articles are duplicates when the first 80 characters of their titles
# match; the first occurrence wins.
seen = set()
unique = []
for article in articles:
    title_key = article['title'][:80]
    if title_key in seen:
        continue
    seen.add(title_key)
    unique.append(article)
# Highest score first; the sort is stable, so ties keep scrape order.
unique.sort(key=lambda item: item['score'], reverse=True)
print(f'Scraped {len(unique)} articles')
# Chinese translation (every article, in score order)
for a in unique:
    try:
        a['title_zh'] = translate(a['title'])
    except Exception:
        # Translation is optional: keep the English title on any error, but
        # let KeyboardInterrupt / SystemExit stop the script.
        a['title_zh'] = a['title']
    # Short pause between requests (presumably to avoid hammering the
    # translation endpoint).
    time.sleep(0.15)
# translate() hands back its input unchanged on failure, so only titles that
# actually changed are counted as translated.
translated = sum(
    1 for a in unique
    if a.get('title_zh') and a['title_zh'] != a['title']
)
print(f' +{translated} Chinese translations')
# Take one timestamp so 'date' and 'updated' can never disagree when the
# script runs across UTC midnight.
now = datetime.now(timezone.utc)
data = {
    'date': now.strftime('%Y-%m-%d'),
    'updated': now.isoformat(),
    'count': len(unique),
    'news': unique,
}
# ensure_ascii=False keeps the Chinese titles readable in the cache file.
with open(CACHE_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Console preview of the three highest-scored articles.
for a in unique[:3]:
    zh = a.get('title_zh', '')
    print(f' [{a["score"]}] {a["title"][:70]}')
    if zh:
        print(f' 🇨🇳 {zh[:70]}')