1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
| import re import csv import datetime import pandas as pd
# Check character keyed by remainder: the character at index r of the string
# is the check code used when the weighted digit sum leaves remainder r mod 11.
CHECK_CODE_MAP = dict(enumerate('10X98765432'))

# Multipliers applied position-by-position to the first 17 digits of an ID
# number when computing the weighted sum.
ID_WEIGHTS = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
def id_valid_format(id_no: str) -> bool:
    """Return True if *id_no* has the shape of an 18-character ID number.

    The first 17 characters must be ASCII digits and the last character must
    be an ASCII digit or the letter ``X`` (either case).

    ``str.isdigit`` on its own also accepts non-ASCII digit characters such
    as full-width ``１`` or superscript ``²``. Those are rejected here so that
    only plain decimal digits reach code that slices and converts them.
    """
    # isascii() rules out full-width and other Unicode digit forms up front.
    if len(id_no) != 18 or not id_no.isascii():
        return False
    body, check_char = id_no[:17], id_no[17]
    return body.isdigit() and (check_char.isdigit() or check_char in 'Xx')
def id_checksum(id_no: str) -> bool:
    """Return True if the last character of *id_no* is the expected check code.

    The ID is first passed through ``id_valid_format``; a malformed ID yields
    False. Otherwise each of the first 17 digits is multiplied by the
    matching entry of ``ID_WEIGHTS``, the products are summed, and the sum
    modulo 11 selects the expected character from ``CHECK_CODE_MAP``. The
    comparison with the 18th character is case-insensitive.
    """
    if not id_valid_format(id_no):
        return False

    weighted_sum = 0
    for digit, weight in zip(id_no[:17], ID_WEIGHTS):
        weighted_sum += int(digit) * weight

    expected_code = CHECK_CODE_MAP[weighted_sum % 11]
    return id_no[17].upper() == expected_code.upper()
def id_sex_from_code(id_no: str) -> str:
    """Return '男' or '女' based on the 17th character of *id_no*.

    The character at index 16 is converted to an integer; an odd value maps
    to '男' and an even value to '女'. The ID is not validated here, so a
    short string or a non-numeric character raises.
    """
    is_even = int(id_no[16]) % 2 == 0
    if is_even:
        return '女'
    return '男'
def id_birth_from_code(id_no: str) -> str:
    """Return the birth date embedded in *id_no* as ``YYYY-MM-DD`` text.

    Characters 7-10 are taken as the year, 11-12 as the month and 13-14 as
    the day. The pieces are joined with hyphens as-is; no check is made that
    they form a real calendar date.
    """
    parts = (id_no[6:10], id_no[10:12], id_no[12:14])
    return '-'.join(parts)
def name_valid(name: str) -> bool:
    """Return True if *name* is 2 to 4 characters, all in U+4E00..U+9FA5."""
    # Cheap length guard before running the regular expression.
    if len(name) not in (2, 3, 4):
        return False
    matched = re.fullmatch(r'[\u4e00-\u9fa5]{2,4}', name)
    return matched is not None
def phone_valid(phone: str) -> bool:
    """Return True if *phone* is exactly 11 ASCII digits starting with ``1``.

    The pattern uses ``[0-9]`` rather than ``\\d``: in a ``str`` pattern
    ``\\d`` matches any Unicode decimal digit, which would let full-width
    digits such as ``３`` pass as a valid phone number.
    """
    return re.fullmatch(r'1[0-9]{10}', phone) is not None
def parse_date(date_str: str) -> datetime.date:
    """Parse *date_str* into a ``datetime.date``.

    Surrounding whitespace is ignored. The formats ``YYYY-MM-DD``,
    ``YYYY/MM/DD``, ``YYYYMMDD`` and ``YYYY.MM.DD`` are tried in that order
    and the first one that parses wins.

    Raises:
        ValueError: if none of the formats match; the message carries the
            original, unstripped input.
    """
    text = date_str.strip()
    candidate_formats = ('%Y-%m-%d', '%Y/%m/%d', '%Y%m%d', '%Y.%m.%d')
    for pattern in candidate_formats:
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            # This format did not fit; fall through to the next one.
            continue
        return parsed.date()
    raise ValueError(f"无法解析出生日期: {date_str}")
def parse_datetime(dt_str: str) -> datetime.datetime:
    """Parse *dt_str* into a ``datetime.datetime``.

    Surrounding whitespace is ignored. Hyphen- and slash-separated dates are
    accepted, with seconds precision, minutes precision, or no time part at
    all (which yields midnight). Formats are tried in a fixed order and the
    first one that parses wins.

    Raises:
        ValueError: if none of the formats match; the message carries the
            original, unstripped input.
    """
    text = dt_str.strip()
    candidate_formats = (
        '%Y-%m-%d %H:%M:%S',
        '%Y/%m/%d %H:%M:%S',
        '%Y-%m-%d %H:%M',
        '%Y/%m/%d %H:%M',
        '%Y-%m-%d',
        '%Y/%m/%d',
    )
    for pattern in candidate_formats:
        try:
            parsed = datetime.datetime.strptime(text, pattern)
        except ValueError:
            # This format did not fit; fall through to the next one.
            continue
        return parsed
    raise ValueError(f"无法解析时间: {dt_str}")
def datetime_logic_valid(birth: datetime.date, register: datetime.datetime, last_login: datetime.datetime) -> bool:
    """Return True if the three timestamps are in a plausible order.

    The birth date must fall strictly before the calendar date of
    registration, and registration must not be later than the last login
    (equal timestamps are accepted).
    """
    born_before_registration = birth < register.date()
    return born_before_registration and register <= last_login
def validate_record(row: pd.Series) -> tuple[bool, str]:
    """Check one customer record and report the first rule it breaks.

    Checks run in this order and stop at the first failure: ID number
    format, ID check code, sex field vs. the sex encoded in the ID, birth
    date field vs. the birth date encoded in the ID, phone format, ordering
    of birth / registration / last-login times, and name format.

    Returns:
        ``(True, "合规")`` when every check passes, otherwise ``(False,
        reason)``. Any exception raised while checking (missing column,
        unparseable date, ...) is caught and reported as
        ``(False, "异常: <message>")`` so one bad row never aborts a batch.
    """
    try:
        id_number = str(row['身份证号']).strip()
        if not id_valid_format(id_number):
            return False, "身份证格式错误"
        if not id_checksum(id_number):
            return False, "身份证校验码错误"

        # Sex declared in the record must agree with the one encoded in the ID.
        sex_from_id = id_sex_from_code(id_number)
        declared_sex = str(row['性别']).strip()
        if declared_sex != sex_from_id:
            return False, f"性别不匹配(身份证: {sex_from_id} vs 字段: {declared_sex})"

        # Same for the birth date; the field is normalised before comparing.
        birth_from_id = id_birth_from_code(id_number)
        declared_birth = parse_date(str(row['出生日期']))
        if declared_birth.strftime('%Y-%m-%d') != birth_from_id:
            return False, f"出生日期不匹配(身份证: {birth_from_id} vs 字段: {declared_birth})"

        phone_number = str(row['手机号']).strip()
        if not phone_valid(phone_number):
            return False, "手机号格式错误"

        registered_at = parse_datetime(str(row['注册时间']))
        last_seen_at = parse_datetime(str(row['最后登录时间']))
        if not datetime_logic_valid(declared_birth, registered_at, last_seen_at):
            return False, "时间逻辑错误"

        customer_name = str(row['姓名']).strip()
        if not name_valid(customer_name):
            return False, "姓名格式错误"

        return True, "合规"
    except Exception as exc:
        return False, f"异常: {exc}"
def main():
    """Validate every row of ``data.csv`` and split it into two CSV files.

    The input is read without a header row, with all columns as strings.
    Rows that pass ``validate_record`` go to ``output_valid.csv``; the rest
    go to ``output_invalid.csv``, which is only written when at least one
    row failed. A summary is printed at the end. The failure reason returned
    by ``validate_record`` is not stored in the output.
    """
    col_names = ['客户id', '姓名', '身份证号', '性别', '手机号', '出生日期', '注册时间', '最后登录时间']
    df = pd.read_csv(
        'data.csv',
        header=None,
        names=col_names,
        dtype=str,
        keep_default_na=False,
        na_values=['', 'NA', 'N/A'],
    )

    valid_rows, invalid_rows = [], []
    for _, record in df.iterrows():
        passed, _reason = validate_record(record)
        bucket = valid_rows if passed else invalid_rows
        bucket.append(record)

    df_valid = pd.DataFrame(valid_rows, columns=col_names)
    df_valid.to_csv('output_valid.csv', index=False, encoding='utf-8-sig')

    has_invalid = bool(invalid_rows)
    if has_invalid:
        df_invalid = pd.DataFrame(invalid_rows, columns=col_names)
        df_invalid.to_csv('output_invalid.csv', index=False, encoding='utf-8-sig')

    print(f"共 {len(df)} 条记录,合规 {len(valid_rows)} 条,违规 {len(invalid_rows)} 条。")
    print("合规数据已写入 output_valid.csv")
    if has_invalid:
        print("违规数据已写入 output_invalid.csv(若需要,可自行扩展错误原因字段)")
# Run the validation only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__": main()
|