logistics/各条目规格.ipynb

119 lines
3.5 KiB
Plaintext
Raw Normal View History

2025-11-26 14:34:04 +08:00
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 属性统计\n",
"def parse_attr_set(attr_set_str):\n",
" \"\"\"解析属性集,返回 {属性值: 属性名} 的字典\"\"\"\n",
" mapping = {}\n",
" for item in attr_set_str.split(\";\"):\n",
" item = item.strip()\n",
" if not item:\n",
" continue\n",
" try:\n",
" left, value = item.split(\"~\", 1) # \"231012:大小~23206184:140*80*75\"\n",
" attr_name = left.split(\":\")[1]\n",
" value = value.split(\":\")[1]\n",
" mapping[value] = attr_name\n",
" except Exception:\n",
" continue\n",
" return mapping\n",
"def map_spec_to_attrs(spec, attr_mapping):\n",
" \"\"\"把规格里的值映射为 {属性名: 规格值}\"\"\"\n",
" results = {}\n",
" if not isinstance(attr_mapping, dict): # 如果不是字典,直接返回 None\n",
" return None\n",
"\n",
" for val in str(spec).split(): # spec 也转成 str避免 NaN\n",
" if val in attr_mapping:\n",
" attr_name = attr_mapping[val]\n",
" results[attr_name] = val\n",
" return results if results else None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from utils.gtools import MySQLconnect\n",
"\n",
"batch_size = 1000000\n",
"offset = 0\n",
"results = []\n",
"leimu = \"furniture\"\n",
"with MySQLconnect('ods') as ods:\n",
" while True:\n",
" sql = f\"\"\"\n",
" SELECT * FROM erp_{leimu}_sku\n",
" LIMIT {batch_size} OFFSET {offset}\n",
" \"\"\"\n",
" chunk = pd.read_sql(sql, ods.con)\n",
" if chunk.empty:\n",
" break\n",
" chunk[\"规格属性映射\"] = chunk.apply(\n",
" lambda row: map_spec_to_attrs(row[\"规格\"], parse_attr_set(row[\"标准/预设属性集\"])),\n",
" axis=1\n",
" )\n",
" print(f\"处理了 {offset} 到 {offset+batch_size} 条数据\")\n",
" results.append(chunk)\n",
" offset += batch_size\n",
"df = pd.concat(results, ignore_index=True)\n",
"all_attrs = {}\n",
"for mapping in df[\"规格属性映射\"].dropna():\n",
" for attr_name, val in mapping.items():\n",
" if attr_name not in all_attrs: # 只保留一个样本\n",
" all_attrs[attr_name] = val\n",
"\n",
"print(all_attrs)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 保存\n",
"out_df = pd.DataFrame(list(all_attrs.items()), columns=[\"规格属性映射\", \"属性值\"])\n",
"out_df.to_excel(f\"D:/test/logistics/test_excel/{leimu}-规格属性映射.xlsx\", index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}