197 lines
8.4 KiB
Python
197 lines
8.4 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
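B1 extended data cleaning: company-certified high-risk scheme details.

Input:  the 2026 "附件1:技术方案(含专项施工方案)编制计划表" workbooks listed
        in CERT_FILES, sheet 表1-专项施工方案 (detail sheet)
Output: certified_schemes_detail.parquet / .csv (Middle East region, one row per scheme)
        certified_schemes.parquet / .csv (project-level summary, used for benchmarking)
        certified_validation.json (validation report)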
|
||
"""
|
||
import pandas as pd
|
||
import openpyxl
|
||
from pathlib import Path
|
||
import json
|
||
import difflib
|
||
|
||
# ============================================================
# CONFIG
# ============================================================
# Folder of the notice that carries the certification workbooks; cleaned
# outputs are written to a "cleaned" sub-folder next to them.
RAW_DIR = Path("/mnt/y/WorkingEmail/OA收文_Incoming")
CERT_SRC_DIR = RAW_DIR.joinpath("2026-03-16_关于公布公司2026年度技术方案编制计划的通知")
OUT_DIR = CERT_SRC_DIR.joinpath("cleaned")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Workbooks to merge; all of them live directly under CERT_SRC_DIR.
CERT_FILES = [
    CERT_SRC_DIR / workbook_name
    for workbook_name in (
        "附件1:技术方案(含专项施工方案)编制计划表-建筑类项目.xlsx",
        "附件1:技术方案(含专项施工方案)编制计划表-非建筑轨道类.xlsx",
        "附件1:技术方案(含专项施工方案)编制计划表--轨道类项目.xlsx",
    )
]

# Cleaned platform registration data used for the comparison in Step 3.
PLATFORM_PARQUET = Path(
    "/mnt/y/Openclaw_Hub/03.资源/实施项目 wiki/dashboard/data/2026-05-24/cleaned/methods_cleaned.parquet"
)
|
||
|
||
# ============================================================
# Step 1: read sheet 表1-专项施工方案 from every workbook (merge + dedupe)
# ============================================================
_CERT_SHEET = '表1-专项施工方案'
_TARGET_REGION = '中东'
# Scheme-name values that mark a row as empty / placeholder.
_PLACEHOLDER_SCHEMES = frozenset({'/', '-', 'nan', 'None'})
# Column order of the detail table; also keeps the frame well-formed when no
# row is collected.
_CERT_COLUMNS = [
    '所属区域', '所属国别', '项目名称', '方案名称', '编制单位',
    '工程类别', '分部工程类别', '是否超一定规模', '计划开工日期',
]


def _cell_text(row, idx):
    """Return cell ``idx`` of ``row`` as stripped text.

    Returns '' when the row is shorter than ``idx`` or the cell is empty, so a
    sheet with fewer columns than expected cannot raise IndexError.
    """
    if idx >= len(row) or not row[idx]:
        return ''
    return str(row[idx]).strip()


print("📖 读取 2026 认定表(3文件合并)...")
all_rows = []
seen = set()  # dedupe key: (project, scheme)

for fpath in CERT_FILES:
    if not fpath.exists():
        print(f" ⚠️ 跳过不存在的文件: {fpath.name}")
        continue

    wb = openpyxl.load_workbook(fpath, data_only=True)
    try:
        if _CERT_SHEET not in wb.sheetnames:
            # Report the skip instead of dropping the file silently.
            print(f" ⚠️ 跳过缺少工作表 {_CERT_SHEET} 的文件: {fpath.name}")
            continue

        ws = wb[_CERT_SHEET]
        file_count = 0
        # Data rows are read from row 3 onwards (rows 1-2 are skipped).
        for row in ws.iter_rows(min_row=3, values_only=True):
            region = _cell_text(row, 1)
            if region != _TARGET_REGION:
                continue

            proj = _cell_text(row, 3)
            scheme = _cell_text(row, 6)
            # Skip invalid records: empty or placeholder scheme name.
            if not scheme or scheme in _PLACEHOLDER_SCHEMES:
                continue

            key = (proj, scheme)
            if key in seen:
                continue  # same (project, scheme) pair already collected
            seen.add(key)

            raw_start = row[13] if len(row) > 13 else None
            all_rows.append({
                '所属区域': region,
                '所属国别': _cell_text(row, 2),
                '项目名称': proj,
                '方案名称': scheme,
                '编制单位': _cell_text(row, 7),
                '工程类别': _cell_text(row, 8),
                '分部工程类别': _cell_text(row, 9),
                '是否超一定规模': _cell_text(row, 11),
                # Keep only the first 10 characters of the stringified cell.
                '计划开工日期': str(raw_start)[:10] if raw_start else '',
            })
            file_count += 1
    finally:
        # Always release the workbook, even if reading a row fails.
        wb.close()

    print(f" {fpath.name}: {file_count} 中东条目")

df_cert = pd.DataFrame(all_rows, columns=_CERT_COLUMNS)
if df_cert.empty:
    # Without any row the later steps have nothing to aggregate; stop with a
    # clear message instead of an opaque KeyError further down.
    raise SystemExit("❌ 未读取到任何中东区域方案,请检查 CERT_FILES 路径与工作表内容")

print(f" 中东区域方案: {len(df_cert)}")
print(f" 覆盖项目: {df_cert['项目名称'].nunique()}")
|
||
|
||
# ============================================================
# Step 2: project-level summary
# ============================================================
# One row per project: number of schemes, number flagged '是' in
# 是否超一定规模, and the first country value seen for that project.
df_proj = (
    df_cert.groupby('项目名称')
    .agg(
        认定_危大方案总数=('方案名称', 'count'),
        认定_超规数=('是否超一定规模', lambda flags: flags.eq('是').sum()),
        所属国别=('所属国别', 'first'),
    )
    .reset_index()
)

project_total = len(df_proj)
scheme_total = df_proj['认定_危大方案总数'].sum()
oversized_total = df_proj['认定_超规数'].sum()

print("\n📊 认定危大方案项目汇总:")
print(f" 项目数: {project_total}")
print(f" 危大方案总计: {scheme_total}")
print(f" 超规总计: {oversized_total}")
|
||
|
||
# ============================================================
# Step 3: join with the platform registration data
# ============================================================
_MATCH_THRESHOLD = 0.75    # minimum similarity to accept a match
_CONTAINMENT_SCORE = 0.95  # floor score when one cleaned name contains the other


def _clean_project_name(name):
    """Remove every '工程项目' / '项目' occurrence from ``name`` and strip whitespace."""
    return str(name).replace('工程项目', '').replace('项目', '').strip()


if PLATFORM_PARQUET.exists():
    df_platform = pd.read_parquet(PLATFORM_PARQUET)
    df_platform_valid = df_platform[df_platform['是否有效登记'] == True]  # noqa: E712

    platform_counts = df_platform_valid.groupby('项目名称').agg(
        平台_方案总数=('方案名称', 'count'),
        平台_超规数=('是否超一定规模', lambda x: (x == '是').sum()),
    ).reset_index()

    # Clean each platform name once instead of once per certified project.
    # Names that clean down to '' are dropped: an empty string is "contained"
    # in every other string and would otherwise be force-matched.
    platform_candidates = [
        (plat_name, plat_clean)
        for plat_name in platform_counts['项目名称']
        if (plat_clean := _clean_project_name(plat_name))
    ]
    # name -> (scheme count, oversized count), for direct lookup after matching.
    platform_stats = {
        plat_name: (int(total), int(oversized))
        for plat_name, total, oversized in platform_counts[
            ['项目名称', '平台_方案总数', '平台_超规数']
        ].itertuples(index=False, name=None)
    }

    # Fuzzy matching.
    # NOTE(review): several certified projects can select the same platform
    # project; its counts then enter the platform totals more than once —
    # confirm whether that is acceptable.
    matched = []
    for row_dict in df_proj.to_dict('records'):
        cert_clean = _clean_project_name(row_dict['项目名称'])

        best_match = None
        best_ratio = 0
        # An empty cleaned certified name has nothing to compare against.
        for plat_name, plat_clean in (platform_candidates if cert_clean else ()):
            ratio = difflib.SequenceMatcher(None, cert_clean, plat_clean).ratio()
            if cert_clean in plat_clean or plat_clean in cert_clean:
                ratio = max(ratio, _CONTAINMENT_SCORE)
            # Strict '>' keeps the first candidate on ties.
            if ratio > best_ratio:
                best_ratio = ratio
                best_match = plat_name

        if best_match is not None and best_ratio >= _MATCH_THRESHOLD:
            plat_total, plat_oversized = platform_stats[best_match]
            row_dict['平台_匹配项目'] = best_match
            row_dict['平台_方案总数'] = plat_total
            row_dict['平台_超规数'] = plat_oversized
            row_dict['差额'] = plat_total - row_dict['认定_危大方案总数']
            row_dict['匹配状态'] = '✅'
        else:
            row_dict['平台_匹配项目'] = ''
            row_dict['平台_方案总数'] = 0
            row_dict['平台_超规数'] = 0
            row_dict['差额'] = -row_dict['认定_危大方案总数']
            row_dict['匹配状态'] = '⚠️ 未匹配'

        matched.append(row_dict)

    df_result = pd.DataFrame(matched)
else:
    # No platform data: the result is the plain project summary, without the
    # comparison columns.
    df_result = df_proj.copy()
    print(" ⚠️ 未找到平台 Parquet")
|
||
|
||
# ============================================================
# Step 4: console summary
# ============================================================
certified_sum = df_result['认定_危大方案总数'].sum()

print("\n📊 对标结果:")
print(f" 认定危大方案总计: {certified_sum}")

# The comparison columns exist only when the platform data was joined.
if '平台_方案总数' in df_result.columns:
    platform_sum = df_result['平台_方案总数'].sum()
    matched_n = df_result['匹配状态'].eq('✅').sum()

    print(f" 与平台匹配: {matched_n}/{len(df_result)}")
    print(f" 平台登记方案: {platform_sum}")
    print(f" 差额: {platform_sum - certified_sum}")

    # Under-reporting warning: platform count below the certified count.
    under = df_result.loc[df_result['差额'] < 0]
    if not under.empty:
        print(f"\n ⚠️ 漏报项目(平台 < 认定): {len(under)}")
        for gap, project, certified_n, platform_n in zip(
            under['差额'],
            under['项目名称'],
            under['认定_危大方案总数'],
            under['平台_方案总数'],
        ):
            print(f" 差额{gap:+d}: {project[:50]} (认定{certified_n} vs 平台{platform_n})")
|
||
|
||
# ============================================================
# Step 5: write outputs
# ============================================================
# Scheme-level detail table (parquet + CSV with BOM).
DETAIL_PATH = OUT_DIR / "certified_schemes_detail.parquet"
df_cert.to_parquet(DETAIL_PATH, index=False)
df_cert.to_csv(OUT_DIR / "certified_schemes_detail.csv", index=False, encoding='utf-8-sig')
print(f"\n✅ 方案明细: {DETAIL_PATH} ({len(df_cert)} 行)")

# Project-level summary (parquet + CSV with BOM).
PROJ_PATH = OUT_DIR / "certified_schemes.parquet"
df_result.to_parquet(PROJ_PATH, index=False)
df_result.to_csv(OUT_DIR / "certified_schemes.csv", index=False, encoding='utf-8-sig')
print(f"✅ 项目汇总: {PROJ_PATH} ({len(df_result)} 行)")

# Validation report.
REPORT_PATH = OUT_DIR / "certified_validation.json"
report = {
    # One plain path string per workbook; str(CERT_FILES) would store the
    # Python repr "[PosixPath('...'), ...]" instead of usable paths.
    "source": [str(path) for path in CERT_FILES],
    "sheet": "表1-专项施工方案",
    "me_schemes": len(df_cert),
    "me_projects": len(df_proj),
    "certified_total": int(df_result['认定_危大方案总数'].sum()),
    "certified_oversized": int(df_result['认定_超规数'].sum()),
}
with open(REPORT_PATH, 'w', encoding='utf-8') as f:
    json.dump(report, f, ensure_ascii=False, indent=2)
print(f"✅ 校验报告: {REPORT_PATH}")
|