logistics/各条目规格.ipynb

3.5 KiB
Raw Permalink Blame History

In [ ]:
# 属性统计
def parse_attr_set(attr_set_str):
    """解析属性集,返回 {属性值: 属性名} 的字典"""
    mapping = {}
    for item in attr_set_str.split(";"):
        item = item.strip()
        if not item:
            continue
        try:
            left, value = item.split("~", 1)   # "231012:大小~23206184:140*80*75"
            attr_name = left.split(":")[1]
            value = value.split(":")[1]
            mapping[value] = attr_name
        except Exception:
            continue
    return mapping
def map_spec_to_attrs(spec, attr_mapping):
    """把规格里的值映射为 {属性名: 规格值}"""
    results = {}
    if not isinstance(attr_mapping, dict):  # 如果不是字典,直接返回 None
        return None

    for val in str(spec).split():  # spec 也转成 str避免 NaN
        if val in attr_mapping:
            attr_name = attr_mapping[val]
            results[attr_name] = val
    return results if results else None
In [ ]:
import pandas as pd
from utils.gtools import MySQLconnect

batch_size = 1000000
offset = 0
results = []
leimu = "furniture"
with MySQLconnect('ods') as ods:
    while True:
        sql = f"""
        SELECT * FROM erp_{leimu}_sku
        LIMIT {batch_size} OFFSET {offset}
        """
        chunk = pd.read_sql(sql, ods.con)
        if chunk.empty:
            break
        chunk["规格属性映射"] = chunk.apply(
            lambda row: map_spec_to_attrs(row["规格"], parse_attr_set(row["标准/预设属性集"])),
            axis=1
        )
        print(f"处理了 {offset}{offset+batch_size} 条数据")
        results.append(chunk)
        offset += batch_size
df = pd.concat(results, ignore_index=True)
all_attrs = {}
for mapping in df["规格属性映射"].dropna():
    for attr_name, val in mapping.items():
        if attr_name not in all_attrs:  # 只保留一个样本
            all_attrs[attr_name] = val

print(all_attrs)
In [ ]:
# 保存
out_df = pd.DataFrame(list(all_attrs.items()), columns=["规格属性映射", "属性值"])
out_df.to_excel(f"D:/test/logistics/test_excel/{leimu}-规格属性映射.xlsx", index=False)
In [ ]: