high-risk-scheme-approval-d.../src/clean_certified.py

197 lines
8.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
B1 extended data cleaning: detail of company-certified high-risk schemes.

Input:  the workbooks listed in CERT_FILES, sheet 表1-专项施工方案 (detail sheet);
        only rows whose region column equals 中东 are kept.
Output: certified_schemes_detail.parquet / .csv  (scheme-level rows)
        certified_schemes.parquet / .csv         (per-project roll-up, used for
                                                  comparison with platform data)
        certified_validation.json                (validation report)
"""
import pandas as pd
import openpyxl
from pathlib import Path
import json
import difflib
# ============================================================
# CONFIG
# ============================================================
# Root of the incoming-document share and the folder of the notice that
# carries the certification workbooks.
RAW_DIR = Path("/mnt/y/WorkingEmail/OA收文_Incoming")
CERT_SRC_DIR = RAW_DIR.joinpath("2026-03-16_关于公布公司2026年度技术方案编制计划的通知")

# Cleaned outputs live in a sub-folder next to the source workbooks; the
# folder is created as soon as this module runs.
OUT_DIR = CERT_SRC_DIR.joinpath("cleaned")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Workbooks merged in Step 1, in processing order.
_CERT_FILE_NAMES = (
    "附件1技术方案含专项施工方案编制计划表-建筑类项目.xlsx",
    "附件1技术方案含专项施工方案编制计划表-非建筑轨道类.xlsx",
    "附件1技术方案含专项施工方案编制计划表--轨道类项目.xlsx",
)
CERT_FILES = [CERT_SRC_DIR / file_name for file_name in _CERT_FILE_NAMES]

# Cleaned platform-registration table that Step 3 joins against.
PLATFORM_PARQUET = Path(
    "/mnt/y/Openclaw_Hub/03.资源/实施项目 wiki/dashboard/data/2026-05-24/cleaned/methods_cleaned.parquet"
)
# ============================================================
# Step 1: read sheet 表1-专项施工方案 from every workbook (merge + de-duplicate)
# ============================================================
_CERT_SHEET = '表1-专项施工方案'
_TARGET_REGION = '中东'

# Scheme-name cells equal to one of these are placeholders, not real schemes.
# The earlier tuple listed '/' twice; the fullwidth slash is included here so
# that both slash variants are treated as placeholders.
_PLACEHOLDER_SCHEMES = frozenset({'/', '/', '-', 'nan', 'None'})

# Column order of the detail table. Passing it to the DataFrame constructor
# keeps the schema intact even when no row survives the filters.
_CERT_COLUMNS = [
    '所属区域', '所属国别', '项目名称', '方案名称', '编制单位',
    '工程类别', '分部工程类别', '是否超一定规模', '计划开工日期',
]


def _cell_text(row, idx):
    """Return ``row[idx]`` as stripped text, or '' if the cell is missing or falsy."""
    if idx >= len(row) or not row[idx]:
        return ''
    return str(row[idx]).strip()


def _cert_record(row):
    """Turn one worksheet row (tuple of cell values) into a detail record.

    Returns ``None`` when the row is not in the target region or its scheme
    name is empty / a placeholder.
    """
    region = _cell_text(row, 1)
    if region != _TARGET_REGION:
        return None
    scheme = _cell_text(row, 6)
    if not scheme or scheme in _PLACEHOLDER_SCHEMES:
        return None
    return {
        '所属区域': region,
        '所属国别': _cell_text(row, 2),
        '项目名称': _cell_text(row, 3),
        '方案名称': scheme,
        '编制单位': _cell_text(row, 7),
        '工程类别': _cell_text(row, 8),
        '分部工程类别': _cell_text(row, 9),
        '是否超一定规模': _cell_text(row, 11),
        # Only the first 10 characters are kept (not stripped) — presumably
        # the YYYY-MM-DD part of a datetime cell; confirm against the sheet.
        '计划开工日期': str(row[13])[:10] if len(row) > 13 and row[13] else '',
    }


print("📖 读取 2026 认定表3文件合并...")
all_rows = []
seen = set()  # de-duplication key: (project, scheme), shared across all files
for fpath in CERT_FILES:
    if not fpath.exists():
        print(f" ⚠️ 跳过不存在的文件: {fpath.name}")
        continue
    wb = openpyxl.load_workbook(fpath, data_only=True)
    try:
        if _CERT_SHEET not in wb.sheetnames:
            continue
        ws = wb[_CERT_SHEET]
        file_count = 0
        # Reading starts at row 3; rows 1-2 are skipped (presumably headers).
        for row in ws.iter_rows(min_row=3, max_row=ws.max_row, values_only=True):
            record = _cert_record(row)
            if record is None:
                continue
            key = (record['项目名称'], record['方案名称'])
            if key in seen:
                continue  # already collected from this or an earlier file
            seen.add(key)
            all_rows.append(record)
            file_count += 1
    finally:
        # Close the workbook even if reading a row raised.
        wb.close()
    print(f" {fpath.name}: {file_count} 中东条目")

df_cert = pd.DataFrame(all_rows, columns=_CERT_COLUMNS)
print(f" 中东区域方案: {len(df_cert)}")
print(f" 覆盖项目: {df_cert['项目名称'].nunique()}")
# ============================================================
# Step 2: per-project roll-up
# ============================================================
def _blank_count(values):
    """Return how many entries of ``values`` equal the empty string."""
    return (values == '').sum()


_by_project = df_cert.groupby('项目名称')
df_proj = _by_project.agg(
    认定_危大方案总数=('方案名称', 'count'),
    # NOTE(review): the "over-scale" figure counts cells that are '' — confirm
    # that a blank cell is really the intended marker for this column.
    认定_超规数=('是否超一定规模', _blank_count),
).reset_index()

# Country per project: the first value found within each project's rows.
country_map = _by_project['所属国别'].first().to_dict()
df_proj['所属国别'] = df_proj['项目名称'].map(country_map)

_scheme_total = df_proj['认定_危大方案总数'].sum()
_oversized_total = df_proj['认定_超规数'].sum()
print("\n📊 认定危大方案项目汇总:")
print(f" 项目数: {len(df_proj)}")
print(f" 危大方案总计: {_scheme_total}")
print(f" 超规总计: {_oversized_total}")
# ============================================================
# Step 3: join with the platform registration data
# ============================================================
_MATCH_THRESHOLD = 0.75    # minimum similarity for a project pair to count as matched
_CONTAINMENT_RATIO = 0.95  # similarity floor when one name contains the other

# Columns this step appends to every project row, in output order.
_PLATFORM_COLUMNS = ['平台_匹配项目', '平台_方案总数', '平台_超规数', '差额', '匹配状态']


def _normalize_project_name(name):
    """Drop every occurrence of 工程项目 / 项目 from ``name`` and strip whitespace."""
    return str(name).replace('工程项目', '').replace('项目', '').strip()


def _best_platform_match(cert_clean, platform_names):
    """Return ``(platform_name, ratio)`` for the candidate closest to ``cert_clean``.

    ``platform_names`` is a sequence of ``(original_name, normalized_name)``
    pairs. On equal ratios the earliest candidate wins. ``(None, 0)`` is
    returned when nothing can be compared.
    """
    best_match, best_ratio = None, 0
    if not cert_clean:
        # A name that normalizes to nothing identifies no project. It must not
        # fall into the containment shortcut below, because '' is a substring
        # of every string.
        return best_match, best_ratio
    for plat_name, plat_clean in platform_names:
        if not plat_clean:
            continue  # same reasoning for blank platform names
        ratio = difflib.SequenceMatcher(None, cert_clean, plat_clean).ratio()
        if cert_clean in plat_clean or plat_clean in cert_clean:
            ratio = max(ratio, _CONTAINMENT_RATIO)
        if ratio > best_ratio:
            best_match, best_ratio = plat_name, ratio
    return best_match, best_ratio


if PLATFORM_PARQUET.exists():
    df_platform = pd.read_parquet(PLATFORM_PARQUET)
    # Element-wise comparison on a Series, hence `== True` rather than `is True`.
    df_platform_valid = df_platform[df_platform['是否有效登记'] == True]  # noqa: E712
    platform_counts = df_platform_valid.groupby('项目名称').agg(
        平台_方案总数=('方案名称', 'count'),
        # NOTE(review): counts cells that are '' — confirm that a blank cell is
        # the intended "over-scale" marker in the platform data.
        平台_超规数=('是否超一定规模', lambda x: (x == '').sum()),
    ).reset_index()

    # Normalize each platform name once instead of once per certified project.
    platform_names = [
        (plat_name, _normalize_project_name(plat_name))
        for plat_name in platform_counts['项目名称']
    ]
    # Project names are unique after the groupby, so a dict lookup is enough.
    platform_totals = {
        plat_name: (int(total), int(oversized))
        for plat_name, total, oversized in zip(
            platform_counts['项目名称'],
            platform_counts['平台_方案总数'],
            platform_counts['平台_超规数'],
        )
    }

    # Fuzzy matching of certified project names against platform project names.
    matched = []
    for _, cert_row in df_proj.iterrows():
        row_dict = cert_row.to_dict()
        best_match, best_ratio = _best_platform_match(
            _normalize_project_name(cert_row['项目名称']), platform_names
        )
        if best_match is not None and best_ratio >= _MATCH_THRESHOLD:
            plat_total, plat_oversized = platform_totals[best_match]
            row_dict['平台_匹配项目'] = best_match
            row_dict['平台_方案总数'] = plat_total
            row_dict['平台_超规数'] = plat_oversized
            row_dict['差额'] = plat_total - row_dict['认定_危大方案总数']
            row_dict['匹配状态'] = ''  # empty status marks a successful match
        else:
            row_dict['平台_匹配项目'] = ''
            row_dict['平台_方案总数'] = 0
            row_dict['平台_超规数'] = 0
            row_dict['差额'] = -row_dict['认定_危大方案总数']
            row_dict['匹配状态'] = '⚠️ 未匹配'
        matched.append(row_dict)
    # Explicit columns keep the schema when there are no projects at all.
    df_result = pd.DataFrame(
        matched, columns=list(df_proj.columns) + _PLATFORM_COLUMNS
    )
else:
    df_result = df_proj.copy()
    print(" ⚠️ 未找到平台 Parquet")
# ============================================================
# Step 4: console summary of the comparison
# ============================================================
_certified_sum = df_result['认定_危大方案总数'].sum()
print("\n📊 对标结果:")
print(f" 认定危大方案总计: {_certified_sum}")

# The platform columns exist only when Step 3 found the platform Parquet.
if '平台_方案总数' in df_result.columns:
    _platform_sum = df_result['平台_方案总数'].sum()
    matched_n = (df_result['匹配状态'] == '').sum()
    print(f" 与平台匹配: {matched_n}/{len(df_result)}")
    print(f" 平台登记方案: {_platform_sum}")
    print(f" 差额: {_platform_sum - _certified_sum}")

    # Under-reporting warning: projects with fewer platform schemes than certified ones.
    under = df_result[df_result['差额'] < 0]
    if not under.empty:
        print(f"\n ⚠️ 漏报项目(平台 < 认定): {len(under)}")
        for _, _row in under.iterrows():
            _gap = _row['差额']
            _name = _row['项目名称'][:50]  # project name truncated to 50 characters
            _cert_n = _row['认定_危大方案总数']
            _plat_n = _row['平台_方案总数']
            print(f" 差额{_gap:+d}: {_name} (认定{_cert_n} vs 平台{_plat_n})")
# ============================================================
# Step 5: write outputs
# ============================================================
def _write_table(frame, parquet_path):
    """Write ``frame`` to ``parquet_path`` and to a same-named .csv beside it."""
    frame.to_parquet(parquet_path, index=False)
    # utf-8-sig writes a BOM at the start of the CSV.
    frame.to_csv(parquet_path.with_suffix('.csv'), index=False, encoding='utf-8-sig')


# Scheme-level detail table
DETAIL_PATH = OUT_DIR / "certified_schemes_detail.parquet"
_write_table(df_cert, DETAIL_PATH)
print(f"\n✅ 方案明细: {DETAIL_PATH} ({len(df_cert)} 行)")

# Per-project roll-up (with platform columns when Step 3 produced them)
PROJ_PATH = OUT_DIR / "certified_schemes.parquet"
_write_table(df_result, PROJ_PATH)
print(f"✅ 项目汇总: {PROJ_PATH} ({len(df_result)} 行)")

# Validation report
_REPORT_PATH = OUT_DIR / "certified_validation.json"
report = {}
report["source"] = str(CERT_FILES)
report["sheet"] = "表1-专项施工方案"
report["me_schemes"] = len(df_cert)
report["me_projects"] = len(df_proj)
report["certified_total"] = int(df_result['认定_危大方案总数'].sum())
report["certified_oversized"] = int(df_result['认定_超规数'].sum())
with _REPORT_PATH.open('w', encoding='utf-8') as f:
    f.write(json.dumps(report, ensure_ascii=False, indent=2))
print(f"✅ 校验报告: {_REPORT_PATH}")