logistics/各条目规格.ipynb

119 lines
3.5 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# 属性统计\n",
"def parse_attr_set(attr_set_str):\n",
"    \"\"\"Parse an attribute-set string into a ``{value: attribute name}`` dict.\n",
"\n",
"    The string is a ``;``-separated list of items shaped like\n",
"    ``231012:大小~23206184:140*80*75``: an ``id:name`` pair, a ``~``, then an\n",
"    ``id:value`` pair. That item yields ``{'140*80*75': '大小'}``.\n",
"\n",
"    Args:\n",
"        attr_set_str: Raw attribute-set text. Anything that is not a string\n",
"            (e.g. ``None`` or NaN) is treated as an empty attribute set.\n",
"\n",
"    Returns:\n",
"        dict mapping each attribute value to its attribute name. Items with\n",
"        no ``~`` or with a side lacking ``:`` are skipped. When the same\n",
"        value occurs more than once, the last occurrence wins.\n",
"    \"\"\"\n",
"    if not isinstance(attr_set_str, str):\n",
"        # Calling .split on None/NaN would raise AttributeError outside the\n",
"        # per-item try block, so non-string input is mapped to an empty dict.\n",
"        return {}\n",
"\n",
"    mapping = {}\n",
"    for item in attr_set_str.split(\";\"):\n",
"        item = item.strip()\n",
"        if not item:\n",
"            continue\n",
"        try:\n",
"            left, value = item.split(\"~\", 1)\n",
"            attr_name = left.split(\":\")[1]\n",
"            value = value.split(\":\")[1]\n",
"        except (ValueError, IndexError):\n",
"            # ValueError: the item has no \"~\"; IndexError: a side has no \":\".\n",
"            continue\n",
"        mapping[value] = attr_name\n",
"    return mapping\n",
"def map_spec_to_attrs(spec, attr_mapping):\n",
"    \"\"\"Build ``{attribute name: spec value}`` for spec tokens found in attr_mapping.\n",
"\n",
"    Args:\n",
"        spec: Specification text. It is converted with ``str()`` and split on\n",
"            whitespace, so non-string input such as NaN does not raise.\n",
"        attr_mapping: ``{value: attribute name}`` dict used for the lookup.\n",
"\n",
"    Returns:\n",
"        dict of the matched tokens keyed by attribute name, or ``None`` when\n",
"        attr_mapping is not a dict or no token matched. If several tokens map\n",
"        to the same attribute name, the last token wins.\n",
"    \"\"\"\n",
"    if not isinstance(attr_mapping, dict):\n",
"        return None\n",
"\n",
"    tokens = str(spec).split()\n",
"    matched = {attr_mapping[token]: token for token in tokens if token in attr_mapping}\n",
"    return matched or None"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from utils.gtools import MySQLconnect\n",
"\n",
"batch_size = 1000000\n",
"offset = 0  # running count of rows processed so far (used for progress output)\n",
"results = []\n",
"leimu = \"furniture\"\n",
"with MySQLconnect('ods') as ods:\n",
"    # Run ONE query and stream it in chunks. Paging with LIMIT/OFFSET and no\n",
"    # ORDER BY gives no guarantee that successive pages are disjoint or\n",
"    # complete, and every page has to skip all previously read rows again.\n",
"    sql = f\"SELECT * FROM erp_{leimu}_sku\"\n",
"    for chunk in pd.read_sql(sql, ods.con, chunksize=batch_size):\n",
"        if chunk.empty:\n",
"            continue\n",
"        # Row-wise: parse the attribute set, then match the spec tokens against it.\n",
"        chunk[\"规格属性映射\"] = chunk.apply(\n",
"            lambda row: map_spec_to_attrs(row[\"规格\"], parse_attr_set(row[\"标准/预设属性集\"])),\n",
"            axis=1\n",
"        )\n",
"        # Report the real upper bound so a short final chunk is not overstated.\n",
"        print(f\"处理了 {offset} 到 {offset + len(chunk)} 条数据\")\n",
"        results.append(chunk)\n",
"        offset += len(chunk)\n",
"# pd.concat raises ValueError on an empty list, so fall back to an empty frame\n",
"# that still has the column the following code reads.\n",
"df = pd.concat(results, ignore_index=True) if results else pd.DataFrame(columns=[\"规格属性映射\"])\n",
"all_attrs = {}\n",
"# Collect one sample value per attribute name across all non-null row mappings.\n",
"for spec_attrs in df[\"规格属性映射\"].dropna():\n",
"    for name, sample in spec_attrs.items():\n",
"        # setdefault keeps the first value seen for each attribute name.\n",
"        all_attrs.setdefault(name, sample)\n",
"\n",
"print(all_attrs)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save: one row per attribute name together with its sample value.\n",
"# NOTE(review): the output location is a hard-coded absolute path.\n",
"out_path = f\"D:/test/logistics/test_excel/{leimu}-规格属性映射.xlsx\"\n",
"rows = [(name, sample) for name, sample in all_attrs.items()]\n",
"out_df = pd.DataFrame(rows, columns=[\"规格属性映射\", \"属性值\"])\n",
"out_df.to_excel(out_path, index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}