119 lines
3.5 KiB
Plaintext
119 lines
3.5 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"# 属性统计\n",
|
"def parse_attr_set(attr_set_str):\n",
"    \"\"\"Parse an attribute-set string into a {attribute value: attribute name} dict.\n",
"\n",
"    The input is a ';'-separated list of items, each shaped like\n",
"    '231012:大小~23206184:140*80*75', i.e. '<id>:<attribute name>~<id>:<value>'.\n",
"    Items that do not match that shape are skipped.\n",
"\n",
"    Args:\n",
"        attr_set_str: Raw attribute-set text. Anything that is not a string\n",
"            (for example None or a float NaN) is treated as an empty\n",
"            attribute set.\n",
"\n",
"    Returns:\n",
"        dict mapping each attribute value to its attribute name; empty when\n",
"        nothing could be parsed.\n",
"    \"\"\"\n",
"    mapping = {}\n",
"    # None/NaN have no .split(); without this guard a single missing value\n",
"    # raises AttributeError and aborts a row-wise DataFrame.apply.\n",
"    if not isinstance(attr_set_str, str):\n",
"        return mapping\n",
"\n",
"    for item in attr_set_str.split(\";\"):\n",
"        item = item.strip()\n",
"        if not item:\n",
"            continue\n",
"        try:\n",
"            left, value = item.split(\"~\", 1)\n",
"            attr_name = left.split(\":\")[1]\n",
"            value = value.split(\":\")[1]\n",
"        except (ValueError, IndexError):\n",
"            # Missing '~' (unpacking fails) or missing ':' (indexing fails):\n",
"            # the item is malformed, so skip it.\n",
"            continue\n",
"        mapping[value] = attr_name\n",
"    return mapping\n",
|
"def map_spec_to_attrs(spec, attr_mapping):\n",
"    \"\"\"Map the whitespace-separated tokens of a spec onto attribute names.\n",
"\n",
"    Args:\n",
"        spec: Spec value; it is converted with str() and split on whitespace.\n",
"        attr_mapping: {value: attribute name} lookup, as built by parse_attr_set.\n",
"\n",
"    Returns:\n",
"        {attribute name: spec token} for every token found in attr_mapping,\n",
"        or None when attr_mapping is not a dict or no token matches.\n",
"    \"\"\"\n",
"    if not isinstance(attr_mapping, dict):\n",
"        return None\n",
"\n",
"    # str() keeps non-string specs (e.g. NaN) from breaking the split.\n",
"    matched = {\n",
"        attr_mapping[token]: token\n",
"        for token in str(spec).split()\n",
"        if token in attr_mapping\n",
"    }\n",
"    return matched or None"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import pandas as pd\n",
|
||
"from utils.gtools import MySQLconnect\n",
|
||
"\n",
|
"# Page through the SKU table, attach a {attribute name: spec value} mapping to\n",
"# every row, then keep one sample value per attribute name.\n",
"batch_size = 1000000\n",
"offset = 0\n",
"results = []\n",
"leimu = \"furniture\"\n",
"with MySQLconnect('ods') as ods:\n",
"    while True:\n",
"        # NOTE(review): LIMIT/OFFSET without ORDER BY gives no guaranteed row\n",
"        # order, so pages may overlap or skip rows. Add ORDER BY on the table's\n",
"        # unique key -- TODO confirm the key column name.\n",
"        sql = f\"\"\"\n",
"        SELECT * FROM erp_{leimu}_sku\n",
"        LIMIT {batch_size} OFFSET {offset}\n",
"        \"\"\"\n",
"        chunk = pd.read_sql(sql, ods.con)\n",
"        if chunk.empty:\n",
"            break\n",
"        chunk[\"规格属性映射\"] = chunk.apply(\n",
"            lambda row: map_spec_to_attrs(row[\"规格\"], parse_attr_set(row[\"标准/预设属性集\"])),\n",
"            axis=1\n",
"        )\n",
"        # Report the rows actually read; the last page is usually shorter than batch_size.\n",
"        print(f\"处理了 {offset} 到 {offset + len(chunk)} 条数据\")\n",
"        results.append(chunk)\n",
"        offset += batch_size\n",
"        # A short page means the table is exhausted; skip the extra empty query.\n",
"        if len(chunk) < batch_size:\n",
"            break\n",
"\n",
"# pd.concat raises ValueError on an empty list, which happens when the table has no rows.\n",
"if results:\n",
"    df = pd.concat(results, ignore_index=True)\n",
"else:\n",
"    df = pd.DataFrame(columns=[\"规格属性映射\"])\n",
"\n",
"all_attrs = {}\n",
"for mapping in df[\"规格属性映射\"].dropna():\n",
"    for attr_name, val in mapping.items():\n",
"        # Keep only the first value seen for each attribute as its sample.\n",
"        all_attrs.setdefault(attr_name, val)\n",
"\n",
"print(all_attrs)\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
"# Save: one row per attribute name (first column) with its sample value (second column).\n",
"attr_rows = list(all_attrs.items())\n",
"out_df = pd.DataFrame(attr_rows, columns=[\"规格属性映射\", \"属性值\"])\n",
"out_df.to_excel(f\"D:/test/logistics/test_excel/{leimu}-规格属性映射.xlsx\", index=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": []
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "base",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.11.5"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 2
|
||
}
|