#!/usr/bin/env python3
"""newsminimalist-scraper — minimal working + Chinese translation.

Fetches the rendered newsminimalist.com front page through a browserless
instance, extracts every scored headline, adds a zh-CN title to each one and
writes the result to ``CACHE_FILE`` as JSON.
"""
import json
import re
import time
import urllib.parse
from datetime import datetime, timezone

import requests
from bs4 import BeautifulSoup

CACHE_FILE = '/root/news_cache.json'

# Patterns used for every <details> element; compiled once instead of per item.
SIGNIFICANCE_RE = re.compile('Significance')
MR_AUTO_RE = re.compile('mr-auto')
SOURCE_RE = re.compile(r'([^\s+]+)')
SCORE_RE = re.compile(r'(\d+\.?\d*)')


def translate(text):
    """Return *text* translated from English to Simplified Chinese.

    Only the first 500 characters are sent to the translation endpoint.
    The call is best-effort: on an empty input ``''`` is returned, and on any
    network error, non-200 response or unexpected payload the original
    *text* is returned unchanged. It never raises for those failures.
    """
    if not text:
        return ''
    try:
        url = (
            'https://translate.googleapis.com/translate_a/single'
            '?client=gtx&sl=en&tl=zh-CN&dt=t'
            f'&q={urllib.parse.quote(text[:500])}'
        )
        r = requests.get(url, timeout=5)
        if r.status_code == 200:
            result = r.json()
            if result and result[0]:
                # result[0] is a list of segments; the translated piece is
                # the first item of each segment.
                return ''.join(p[0] for p in result[0] if p[0])
    except (requests.RequestException, ValueError, LookupError, TypeError):
        # Network failure, invalid JSON or an unexpected payload shape:
        # fall through to the untranslated text. KeyboardInterrupt and
        # SystemExit are deliberately NOT swallowed.
        pass
    return text


# === Main scrape logic (verified working) ===
r = requests.post(
    'http://browserless:3000/content',
    json={'url': 'https://www.newsminimalist.com/'},
    timeout=60,
)
# Stop on an HTTP error instead of parsing an error page, which would yield
# zero articles and overwrite the existing cache with an empty list.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

articles = []
for d in soup.find_all('details'):
    summary = d.find('summary')
    if not summary:
        continue
    score_span = summary.find('span', title=SIGNIFICANCE_RE)
    if not score_span:
        continue
    mr = summary.find('div', class_=MR_AUTO_RE)
    if not mr:
        continue

    title = ''
    source = ''
    link = ''
    for s in mr.find_all('span', recursive=False):
        cls = ' '.join(s.get('class', []))
        if 'inline-block' in cls:
            # Source badge, e.g. "(example.com ...)": keep the leading token.
            src_text = s.get_text(strip=True).lstrip('(').rstrip(')')
            m = SOURCE_RE.match(src_text)
            if m:
                source = m.group(1)
        elif not title:
            # The first non-badge span is the headline.
            title = s.get_text(strip=True)
    if not title:
        continue

    # First absolute link that points away from newsminimalist.com.
    for a in d.find_all('a', href=True):
        href = a['href']
        if href.startswith('http') and 'newsminimalist.com' not in href:
            link = href
            break

    sm = SCORE_RE.search(score_span.get_text(strip=True))
    score = float(sm.group(1)) if sm else 0

    articles.append({
        'title': title,
        'link': link or 'https://www.newsminimalist.com',
        'score': score,
        'source': source,
        'summary': '',
        'title_zh': '',
    })

# Dedup (on the first 80 characters of the title, first occurrence wins) & sort
seen = set()
unique = []
for a in articles:
    key = a['title'][:80]
    if key in seen:
        continue
    seen.add(key)
    unique.append(a)
unique.sort(key=lambda a: a['score'], reverse=True)
print(f'Scraped {len(unique)} articles')

# Chinese translation: every article is translated (best-effort), so the
# count below covers the whole list and only counts titles that actually
# came back different from the English original.
for a in unique:
    try:
        a['title_zh'] = translate(a['title'])
        time.sleep(0.15)  # small pause between requests to the endpoint
    except Exception:
        # Keep the run going on an unforeseen error; Ctrl-C still aborts.
        a['title_zh'] = a['title']
translated = sum(
    1 for a in unique if a['title_zh'] and a['title_zh'] != a['title']
)
print(f' +{translated} Chinese translations')

# One timestamp for both fields so 'date' and 'updated' can never disagree.
now = datetime.now(timezone.utc)
data = {
    'date': now.strftime('%Y-%m-%d'),
    'updated': now.isoformat(),
    'count': len(unique),
    'news': unique,
}
with open(CACHE_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Preview of the three highest-scored articles.
for a in unique[:3]:
    zh = a.get('title_zh', '')
    print(f' [{a["score"]}] {a["title"][:70]}')
    if zh:
        print(f' 🇨🇳 {zh[:70]}')