#!/usr/bin/env python3
"""B1 extended data cleaning: company-certified hazardous-scheme details.

Input:
    The workbooks listed in ``CERT_FILES`` (sheet ``表1-专项施工方案``), and,
    when present, the platform registration parquet ``PLATFORM_PARQUET``.

Output (all written to ``OUT_DIR``):
    certified_schemes_detail.parquet / .csv
        One row per certified scheme whose region is ``中东``.
    certified_schemes.parquet / .csv
        Per-project summary, joined with platform counts when the platform
        parquet is available.
    certified_validation.json
        Small validation report with row and project totals.
"""

import difflib
import json
from pathlib import Path

import pandas as pd

# ============================================================
# CONFIG
# ============================================================
RAW_DIR = Path("/mnt/y/WorkingEmail/OA收文_Incoming")
CERT_SRC_DIR = RAW_DIR / "2026-03-16_关于公布公司2026年度技术方案编制计划的通知"
OUT_DIR = CERT_SRC_DIR / "cleaned"

CERT_FILES = [
    CERT_SRC_DIR / "附件1:技术方案(含专项施工方案)编制计划表-建筑类项目.xlsx",
    CERT_SRC_DIR / "附件1:技术方案(含专项施工方案)编制计划表-非建筑轨道类.xlsx",
    CERT_SRC_DIR / "附件1:技术方案(含专项施工方案)编制计划表--轨道类项目.xlsx",
]

PLATFORM_PARQUET = Path("/mnt/y/Openclaw_Hub/03.资源/实施项目 wiki/dashboard/data/2026-05-24/cleaned/methods_cleaned.parquet")

SHEET_NAME = '表1-专项施工方案'
TARGET_REGION = '中东'

# Scheme names that are placeholders rather than real entries.
# NOTE(review): the original tuple listed '/' twice; the second entry was
# presumably a full-width slash lost to text normalisation - confirm.
PLACEHOLDER_SCHEMES = frozenset({'/', '-', 'nan', 'None', '\uff0f'})

# Column order of the scheme-level detail table.
DETAIL_COLUMNS = [
    '所属区域', '所属国别', '项目名称', '方案名称', '编制单位',
    '工程类别', '分部工程类别', '是否超一定规模', '计划开工日期',
]

# Column order of the project-level comparison table.
RESULT_COLUMNS = [
    '项目名称', '认定_危大方案总数', '认定_超规数', '所属国别',
    '平台_匹配项目', '平台_方案总数', '平台_超规数', '差额', '匹配状态',
]

MATCH_THRESHOLD = 0.75    # minimum similarity to accept a platform match
CONTAINMENT_RATIO = 0.95  # similarity floor when one name contains the other


def _cell_text(row, idx):
    """Return cell ``idx`` of ``row`` as stripped text.

    Missing columns and falsy cell values (``None``, ``''``, ``0``) both
    yield ``''``.
    """
    if idx >= len(row) or not row[idx]:
        return ''
    return str(row[idx]).strip()


# ============================================================
# Step 1: read every workbook (merge + de-duplicate)
# ============================================================
def read_certified_rows(files=CERT_FILES):
    """Collect the target-region scheme rows from all certification workbooks.

    Args:
        files: Iterable of workbook paths. Paths that do not exist and
            workbooks without the ``SHEET_NAME`` sheet are skipped.

    Returns:
        A list of dicts keyed by ``DETAIL_COLUMNS``. Rows are de-duplicated
        on (project name, scheme name) across all files; the first
        occurrence wins.
    """
    # Imported here so the pure summarising/matching helpers stay importable
    # without the Excel dependency.
    import openpyxl

    rows = []
    seen = set()  # de-duplication key: (project, scheme)
    for fpath in files:
        if not fpath.exists():
            print(f" ⚠️ 跳过不存在的文件: {fpath.name}")
            continue
        wb = openpyxl.load_workbook(fpath, data_only=True)
        try:
            if SHEET_NAME not in wb.sheetnames:
                continue
            ws = wb[SHEET_NAME]
            file_count = 0
            # Data starts on row 3 of the sheet.
            for row in ws.iter_rows(min_row=3, max_row=ws.max_row, values_only=True):
                region = _cell_text(row, 1)
                if region != TARGET_REGION:
                    continue
                proj = _cell_text(row, 3)
                scheme = _cell_text(row, 6)
                # Skip invalid records: empty or placeholder scheme name.
                if not scheme or scheme in PLACEHOLDER_SCHEMES:
                    continue
                key = (proj, scheme)
                if key in seen:
                    continue  # duplicate, e.g. repeated in another file
                seen.add(key)
                rows.append({
                    '所属区域': region,
                    '所属国别': _cell_text(row, 2),
                    '项目名称': proj,
                    '方案名称': scheme,
                    '编制单位': _cell_text(row, 7),
                    '工程类别': _cell_text(row, 8),
                    '分部工程类别': _cell_text(row, 9),
                    '是否超一定规模': _cell_text(row, 11),
                    # Keep only the first 10 characters of the cell's text.
                    '计划开工日期': str(row[13])[:10] if len(row) > 13 and row[13] else '',
                })
                file_count += 1
        finally:
            # Close the workbook even if reading a row raised.
            wb.close()
        print(f" {fpath.name}: {file_count} 中东条目")
    return rows


def build_detail_frame(rows):
    """Build the detail DataFrame; the columns exist even when ``rows`` is empty."""
    return pd.DataFrame(rows, columns=DETAIL_COLUMNS)


# ============================================================
# Step 2: project-level summary
# ============================================================
def summarize_by_project(df_cert):
    """Aggregate the detail table to one row per project.

    Returns:
        DataFrame with columns ``项目名称``, ``认定_危大方案总数`` (scheme count),
        ``认定_超规数`` (rows whose ``是否超一定规模`` equals ``是``) and
        ``所属国别`` (first country value seen for the project).
    """
    df_proj = df_cert.groupby('项目名称').agg(
        认定_危大方案总数=('方案名称', 'count'),
        认定_超规数=('是否超一定规模', lambda x: (x == '是').sum()),
    ).reset_index()
    country_map = df_cert.groupby('项目名称')['所属国别'].first().to_dict()
    df_proj['所属国别'] = df_proj['项目名称'].map(country_map)
    return df_proj


# ============================================================
# Step 3: join with the platform registration data
# ============================================================
def normalize_project_name(name):
    """Strip the generic suffix words ``工程项目`` / ``项目`` and outer whitespace."""
    return str(name).replace('工程项目', '').replace('项目', '').strip()


def find_best_match(cert_name, candidates):
    """Find the platform project most similar to ``cert_name``.

    Args:
        cert_name: Project name from the certification table.
        candidates: Iterable of ``(platform_name, normalized_name)`` pairs.

    Returns:
        ``(platform_name, ratio)`` for the best candidate, or ``(None, 0.0)``
        when nothing can be compared. Ties keep the earliest candidate.
    """
    cert_clean = normalize_project_name(cert_name)
    best_match, best_ratio = None, 0.0
    if not cert_clean:
        # An empty string is "contained" in every string, so without this
        # guard a blank name would match the first platform project.
        return best_match, best_ratio
    for plat_name, plat_clean in candidates:
        if not plat_clean:
            continue  # same reasoning as above, for blank platform names
        ratio = difflib.SequenceMatcher(None, cert_clean, plat_clean).ratio()
        if cert_clean in plat_clean or plat_clean in cert_clean:
            ratio = max(ratio, CONTAINMENT_RATIO)
        if ratio > best_ratio:
            best_ratio = ratio
            best_match = plat_name
    return best_match, best_ratio


def match_with_platform(df_proj, df_platform):
    """Attach platform registration counts to each certified project.

    Only platform rows with ``是否有效登记 == True`` are counted. A project is
    matched when its best similarity ratio reaches ``MATCH_THRESHOLD``;
    otherwise the platform counts are 0 and ``差额`` is the negated certified
    count.

    Returns:
        DataFrame with the columns in ``RESULT_COLUMNS``.
    """
    # Element-wise comparison is intended here, hence "== True".
    df_valid = df_platform[df_platform['是否有效登记'] == True]  # noqa: E712
    platform_counts = df_valid.groupby('项目名称').agg(
        平台_方案总数=('方案名称', 'count'),
        平台_超规数=('是否超一定规模', lambda x: (x == '是').sum()),
    ).reset_index()

    # Normalise the platform names and index the counts once, rather than
    # once per certified project.
    candidates = [
        (name, normalize_project_name(name))
        for name in platform_counts['项目名称']
    ]
    counts_by_name = {
        name: (int(total), int(oversized))
        for name, total, oversized in zip(
            platform_counts['项目名称'],
            platform_counts['平台_方案总数'],
            platform_counts['平台_超规数'],
        )
    }

    matched = []
    for record in df_proj.to_dict('records'):
        best_match, best_ratio = find_best_match(record['项目名称'], candidates)
        if best_match is not None and best_ratio >= MATCH_THRESHOLD:
            total, oversized = counts_by_name[best_match]
            record['平台_匹配项目'] = best_match
            record['平台_方案总数'] = total
            record['平台_超规数'] = oversized
            record['差额'] = total - record['认定_危大方案总数']
            record['匹配状态'] = '✅'
        else:
            record['平台_匹配项目'] = ''
            record['平台_方案总数'] = 0
            record['平台_超规数'] = 0
            record['差额'] = -record['认定_危大方案总数']
            record['匹配状态'] = '⚠️ 未匹配'
        matched.append(record)

    if not matched:
        # Keep the schema stable so later column lookups do not fail.
        return pd.DataFrame(columns=RESULT_COLUMNS)
    return pd.DataFrame(matched)


# ============================================================
# Step 4: console summary
# ============================================================
def print_comparison(df_result):
    """Print the totals and, when platform data was joined, the shortfalls."""
    print(f"\n📊 对标结果:")
    print(f" 认定危大方案总计: {df_result['认定_危大方案总数'].sum()}")
    if '平台_方案总数' not in df_result.columns:
        return
    matched_n = (df_result['匹配状态'] == '✅').sum()
    print(f" 与平台匹配: {matched_n}/{len(df_result)}")
    print(f" 平台登记方案: {df_result['平台_方案总数'].sum()}")
    print(f" 差额: {df_result['平台_方案总数'].sum() - df_result['认定_危大方案总数'].sum()}")
    # Under-reporting warning: the platform has fewer schemes than certified.
    under = df_result[df_result['差额'] < 0]
    if len(under) > 0:
        print(f"\n ⚠️ 漏报项目(平台 < 认定): {len(under)}")
        for _, r in under.iterrows():
            print(f" 差额{r['差额']:+d}: {r['项目名称'][:50]} (认定{r['认定_危大方案总数']} vs 平台{r['平台_方案总数']})")


# ============================================================
# Step 5: outputs
# ============================================================
def write_outputs(df_cert, df_proj, df_result):
    """Write the detail table, the project summary and the validation report."""
    # Scheme-level detail table.
    DETAIL_PATH = OUT_DIR / "certified_schemes_detail.parquet"
    df_cert.to_parquet(DETAIL_PATH, index=False)
    df_cert.to_csv(OUT_DIR / "certified_schemes_detail.csv", index=False, encoding='utf-8-sig')
    print(f"\n✅ 方案明细: {DETAIL_PATH} ({len(df_cert)} 行)")

    # Project-level summary.
    PROJ_PATH = OUT_DIR / "certified_schemes.parquet"
    df_result.to_parquet(PROJ_PATH, index=False)
    df_result.to_csv(OUT_DIR / "certified_schemes.csv", index=False, encoding='utf-8-sig')
    print(f"✅ 项目汇总: {PROJ_PATH} ({len(df_result)} 行)")

    # Validation report.
    # NOTE(review): "source" is the repr of a list of Path objects; kept as-is
    # so the report format does not change for existing readers.
    report = {
        "source": str(CERT_FILES),
        "sheet": "表1-专项施工方案",
        "me_schemes": len(df_cert),
        "me_projects": len(df_proj),
        "certified_total": int(df_result['认定_危大方案总数'].sum()),
        "certified_oversized": int(df_result['认定_超规数'].sum()),
    }
    with open(OUT_DIR / "certified_validation.json", 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"✅ 校验报告: {OUT_DIR / 'certified_validation.json'}")


def main():
    """Run the pipeline: read, summarise, compare with the platform, write."""
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    print("📖 读取 2026 认定表(3文件合并)...")
    df_cert = build_detail_frame(read_certified_rows(CERT_FILES))
    if df_cert.empty:
        # Stop before writing anything: continuing would replace earlier
        # outputs with empty files (e.g. when the source share is offline).
        raise SystemExit(" ❌ 未读取到任何中东区域方案, 请检查输入文件")
    print(f" 中东区域方案: {len(df_cert)}")
    print(f" 覆盖项目: {df_cert['项目名称'].nunique()}")

    df_proj = summarize_by_project(df_cert)
    print(f"\n📊 认定危大方案项目汇总:")
    print(f" 项目数: {len(df_proj)}")
    print(f" 危大方案总计: {df_proj['认定_危大方案总数'].sum()}")
    print(f" 超规总计: {df_proj['认定_超规数'].sum()}")

    if PLATFORM_PARQUET.exists():
        df_result = match_with_platform(df_proj, pd.read_parquet(PLATFORM_PARQUET))
    else:
        df_result = df_proj.copy()
        print(" ⚠️ 未找到平台 Parquet")

    print_comparison(df_result)
    write_outputs(df_cert, df_proj, df_result)


if __name__ == "__main__":
    main()