3.5 KiB
3.5 KiB
In [ ]:
# 属性统计 def parse_attr_set(attr_set_str): """解析属性集,返回 {属性值: 属性名} 的字典""" mapping = {} for item in attr_set_str.split(";"): item = item.strip() if not item: continue try: left, value = item.split("~", 1) # "231012:大小~23206184:140*80*75" attr_name = left.split(":")[1] value = value.split(":")[1] mapping[value] = attr_name except Exception: continue return mapping def map_spec_to_attrs(spec, attr_mapping): """把规格里的值映射为 {属性名: 规格值}""" results = {} if not isinstance(attr_mapping, dict): # 如果不是字典,直接返回 None return None for val in str(spec).split(): # spec 也转成 str,避免 NaN if val in attr_mapping: attr_name = attr_mapping[val] results[attr_name] = val return results if results else None
In [ ]:
import pandas as pd from utils.gtools import MySQLconnect batch_size = 1000000 offset = 0 results = [] leimu = "furniture" with MySQLconnect('ods') as ods: while True: sql = f""" SELECT * FROM erp_{leimu}_sku LIMIT {batch_size} OFFSET {offset} """ chunk = pd.read_sql(sql, ods.con) if chunk.empty: break chunk["规格属性映射"] = chunk.apply( lambda row: map_spec_to_attrs(row["规格"], parse_attr_set(row["标准/预设属性集"])), axis=1 ) print(f"处理了 {offset} 到 {offset+batch_size} 条数据") results.append(chunk) offset += batch_size df = pd.concat(results, ignore_index=True) all_attrs = {} for mapping in df["规格属性映射"].dropna(): for attr_name, val in mapping.items(): if attr_name not in all_attrs: # 只保留一个样本 all_attrs[attr_name] = val print(all_attrs)
In [ ]:
# 保存 out_df = pd.DataFrame(list(all_attrs.items()), columns=["规格属性映射", "属性值"]) out_df.to_excel(f"D:/test/logistics/test_excel/{leimu}-规格属性映射.xlsx", index=False)
In [ ]: