104 lines
3.0 KiB
Python
104 lines
3.0 KiB
Python
#!/usr/bin/env python3
|
|
"""newsminimalist-scraper — minimal working + Chinese translation"""
|
|
import json, re, time, urllib.parse
|
|
from datetime import datetime, timezone
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
CACHE_FILE = '/root/news_cache.json'
|
|
|
|
|
|
def translate(text):
    """Translate English *text* to Simplified Chinese, best-effort.

    Sends at most the first 500 characters of *text* to the
    ``translate.googleapis.com`` ``translate_a/single`` endpoint
    (``sl=en``, ``tl=zh-CN``) with a 5 second timeout.

    Args:
        text: The string to translate. Falsy values (``''``, ``None``)
            short-circuit without any network call.

    Returns:
        ``''`` when *text* is falsy; the concatenated translated segments
        when the request succeeds; otherwise the original *text* unchanged
        (non-200 status, network failure, undecodable or unexpectedly
        shaped response).
    """
    if not text:
        return ''

    url = (
        'https://translate.googleapis.com/translate_a/single'
        '?client=gtx&sl=en&tl=zh-CN&dt=t'
        f'&q={urllib.parse.quote(text[:500])}'
    )
    try:
        r = requests.get(url, timeout=5)
        if r.status_code == 200:
            result = r.json()
            # The payload is treated as a nested list whose first element
            # holds the segments; each segment's first item is the
            # translated text for that segment.
            if result and result[0]:
                return ''.join(p[0] for p in result[0] if p[0])
    except (requests.RequestException, ValueError, TypeError, LookupError):
        # Best-effort: network errors, invalid JSON (ValueError) and an
        # unexpected response shape (TypeError / IndexError / KeyError)
        # all fall back to the untranslated text. Anything else, such as
        # KeyboardInterrupt or a genuine bug, is allowed to propagate.
        pass
    return text
|
|
|
|
|
|
# === Main scrape logic (verified working) ===
# Patterns are compiled once here instead of on every loop iteration.
_SIGNIFICANCE_RE = re.compile('Significance')
_MR_AUTO_RE = re.compile('mr-auto')
_SOURCE_RE = re.compile(r'([^\s+]+)')
_SCORE_RE = re.compile(r'(\d+\.?\d*)')

# Ask the browserless service for the content of the site's front page.
r = requests.post(
    'http://browserless:3000/content',
    json={'url': 'https://www.newsminimalist.com/'},
    timeout=60,
)
# Fail loudly on an HTTP error: parsing an error page would yield zero
# articles, and the cache file would then be overwritten with an empty list.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

# One dict per story; each story is read from a <details> element.
articles = []
for d in soup.find_all('details'):
    summary = d.find('summary')
    if not summary:
        continue
    # The score lives in a <span> whose title attribute mentions "Significance".
    score_span = summary.find('span', title=_SIGNIFICANCE_RE)
    if not score_span:
        continue
    # Title and source spans are direct children of the "mr-auto" <div>.
    mr = summary.find('div', class_=_MR_AUTO_RE)
    if not mr:
        continue

    title = ''
    source = ''
    for s in mr.find_all('span', recursive=False):
        cls = ' '.join(s.get('class', []))
        if 'inline-block' in cls:
            # Source span: drop the surrounding parentheses and keep the
            # leading run of characters up to the first whitespace or '+'.
            src_text = s.get_text(strip=True).lstrip('(').rstrip(')')
            m = _SOURCE_RE.match(src_text)
            if m:
                source = m.group(1)
        elif not title:
            # The first other span with non-empty text is used as the title.
            title = s.get_text(strip=True)

    if not title:
        continue

    # First absolute link that points away from newsminimalist.com, if any.
    link = next(
        (
            a['href']
            for a in d.find_all('a', href=True)
            if a['href'].startswith('http')
            and 'newsminimalist.com' not in a['href']
        ),
        '',
    )

    # First number in the score text; 0.0 when none is found, so that
    # 'score' is always a float.
    sm = _SCORE_RE.search(score_span.get_text(strip=True))
    score = float(sm.group(1)) if sm else 0.0

    articles.append({
        'title': title,
        'link': link or 'https://www.newsminimalist.com',
        'score': score,
        'source': source,
        'summary': '',
        'title_zh': '',
    })
|
|
|
|
# Dedup & sort
# Two articles count as duplicates when the first 80 characters of their
# titles match; the first occurrence is the one that is kept.
seen = set()
unique = []
for article in articles:
    title_key = article['title'][:80]
    if title_key in seen:
        continue
    seen.add(title_key)
    unique.append(article)

# Highest score first.
unique.sort(key=lambda item: item['score'], reverse=True)

print(f'Scraped {len(unique)} articles')
|
|
|
|
# Chinese translation of every article title (the loop is not limited to a
# top-N subset, so the count below covers the same full list).
for a in unique:
    try:
        a['title_zh'] = translate(a['title'])
    except Exception:
        # Keep the run going with the untranslated title; unlike a bare
        # `except:`, this still lets KeyboardInterrupt / SystemExit through.
        a['title_zh'] = a['title']
    else:
        # Short pause between successful calls to the translation endpoint.
        time.sleep(0.15)

# translate() hands back the original text when it fails, so a title only
# counts as translated when the result actually differs from the source.
translated = sum(
    1 for a in unique
    if a.get('title_zh') and a['title_zh'] != a['title']
)
print(f' +{translated} Chinese translations')
|
|
|
|
# Take a single UTC timestamp so 'date' and 'updated' can never disagree
# (two separate now() calls could straddle midnight).
now = datetime.now(timezone.utc)
data = {
    'date': now.strftime('%Y-%m-%d'),
    'updated': now.isoformat(),
    'count': len(unique),
    'news': unique,
}

# Persist the result; ensure_ascii=False keeps the Chinese titles readable
# in the JSON file instead of \u escapes.
with open(CACHE_FILE, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

# Console preview of the three highest-scored articles.
for a in unique[:3]:
    zh = a.get('title_zh', '')
    print(f' [{a["score"]}] {a["title"][:70]}')
    if zh:
        print(f' 🇨🇳 {zh[:70]}')
|