logistics/改SPU尺寸.ipynb

545 lines
19 KiB
Plaintext
Raw Normal View History

2025-06-17 13:40:20 +08:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"取数据"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from utils.gtools import MySQLconnect\n",
"\n",
"# Fetch measured packages for single-SKU, single-unit orders (one row per order).\n",
"with MySQLconnect('ods') as db:\n",
"    sql = r\"\"\"\n",
"# 限制范围是测量时间取得SKU种类为1且数量为1的订单且重复SKU只取最近的订单\n",
"# 测量时间D +2 天进行汇总数据\n",
"# 订单汇总产品数和取出\n",
"WITH\n",
"t1 AS (\n",
"SELECT\n",
"order_id,\n",
"SKU,\n",
"order_date,\n",
"sum(CASE WHEN opl.order_product_id REGEXP \"[0-9]{15}_[0-9]*$\"\n",
"    THEN product_num END) AS product_num,\n",
"DATE_FORMAT(order_date,\"%Y-%m-%d\") AS 订单时间,\n",
"count(DISTINCT opl.SKU) AS 产品种类\n",
"FROM\n",
"dws.order_product_list opl\n",
"WHERE\n",
"    NOT EXISTS (\n",
"        SELECT 1\n",
"        FROM dws.log_order_reissue_detail AS r\n",
"        WHERE left(r.order_product_id,15) = opl.order_id\n",
"    )\n",
"AND order_date >= \"2025-05-01\"\n",
"AND order_date < \"2025-09-18\"\n",
"AND SKU <> \"\"\n",
"GROUP BY order_id\n",
"),\n",
"t2 AS (\n",
"SELECT\n",
"    a.`包裹测量时间`,\n",
"    t1.order_id,\n",
"    t1.SKU,\n",
"    t1.order_date,\n",
"    a.包裹号,\n",
"    a.快递公司,\n",
"    a.运输方式,\n",
"    a.`目的国`,\n",
"    d.postcode,\n",
"    CONCAT(\n",
"        '\"', b.package, '\": {',\n",
"        '\"长\": ', length, ', ',\n",
"        '\"宽\": ', width, ', ',\n",
"        '\"高\": ', hight, ', ',\n",
"        '\"重量\": ', weight, '}'\n",
"    ) AS package_json\n",
"    FROM\n",
"    t1\n",
"    LEFT JOIN order_express a ON t1.order_id = a.单号\n",
"    JOIN package_vol_info b ON a.`包裹号` = b.package\n",
"    JOIN order_list d ON a.`单号` = d.order_id\n",
"    WHERE\n",
"    a.`包裹状态` != '--'\n",
"    AND b.length > 0\n",
"    AND b.width > 0\n",
"    AND b.hight > 0\n",
"    AND b.weight > 0\n",
"    AND t1.product_num = 1\n",
"    AND t1.产品种类=1\n",
"    AND a.`包裹测量时间` >= '2025-06-01'\n",
"    AND a.`包裹测量时间` < '2025-09-16'\n",
"),\n",
"t3 AS (\n",
"SELECT\n",
"t2.*,\n",
"SPU,\n",
"sku.成本价 AS ERP采购价,\n",
"CONCAT('{', GROUP_CONCAT(package_json SEPARATOR ','), '}') AS 实际包裹数据,\n",
"count(package_json) AS 包裹数,\n",
"ROW_NUMBER() OVER (PARTITION BY SKU ORDER BY 包裹测量时间 DESC) as rn\n",
"FROM\n",
"t2\n",
"LEFT JOIN stg_bayshop_litfad_sku sku ON t2.SKU=sku.SKU\n",
"left JOIN stg_bayshop_litfad_spu spu ON sku.产品PID=spu.产品PID\n",
"GROUP BY order_id\n",
")\n",
"SELECT\n",
"包裹测量时间,\n",
"order_id,\n",
"SPU,\n",
"SKU,\n",
"DATE_FORMAT(order_date,\"%Y-%m-%d\") AS 订单时间,\n",
"包裹号,\n",
"`快递公司`,\n",
"`运输方式`,\n",
"`目的国`,\n",
"postcode,\n",
"ERP采购价,\n",
"实际包裹数据,\n",
"包裹数,\n",
"rn AS 从新到旧\n",
"FROM\n",
"t3\n",
"    \"\"\"\n",
"    df = pd.read_sql(sql, db.con)\n",
"    print(df)\n",
"    df.to_clipboard(index=False)\n",
"\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"拆开实际包裹数据并标记为1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the measured package JSON into numeric size/weight columns on base_df.\n",
"import re\n",
"import json\n",
"\n",
"def _to_number(raw):\n",
"    \"\"\"Return the first numeric token found in *raw* as float; otherwise *raw*.\"\"\"\n",
"    matches = re.findall(r\"[-+]?\\d*\\.\\d+|\\d+\", str(raw))\n",
"    if not matches:\n",
"        return raw\n",
"    try:\n",
"        return float(matches[0])\n",
"    except ValueError:\n",
"        return raw\n",
"\n",
"base_df = df.copy()\n",
"for index, row in base_df.iterrows():\n",
"    raw_json = row['实际包裹数据']\n",
"    if not isinstance(raw_json, str) or not raw_json:\n",
"        print(f\"第{index}行包裹数据为空或非字符串,跳过\")\n",
"        continue\n",
"    try:\n",
"        package_dict = json.loads(raw_json)\n",
"    except json.JSONDecodeError as e:\n",
"        print(f\"解析失败:第{index}行,错误信息:{e}\")\n",
"        continue\n",
"    count = len(package_dict)\n",
"    print(f\"第{index}行,包裹数量:{count}\")\n",
"    if count != 1:\n",
"        print(f\"第{index}行包裹数量不为1跳过\")\n",
"        continue\n",
"    try:\n",
"        for package in package_dict.values():\n",
"            item = {key: _to_number(value) for key, value in package.items()}\n",
"    except AttributeError:\n",
"        print(f\"解析失败:第{index}行,错误信息:包裹数据为空\")\n",
"        continue\n",
"    # Canonical orientation: longest edge -> 长, shortest -> 高\n",
"    height, width, length = sorted([item['长'], item['宽'], item['高']])\n",
"    weight = item['重量']\n",
"    base_df.loc[index, 'is_first'] = 1\n",
"    base_df.loc[index, '长'] = length\n",
"    base_df.loc[index, '宽'] = width\n",
"    base_df.loc[index, '高'] = height\n",
"    base_df.loc[index, '重量'] = weight\n",
"\n",
"    print(f\"{row['SKU']}尺寸为:{width},h:{height},d:{length},w:{weight}\")\n",
"base_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"取SPU下所有SKU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch every SKU (and current pricing fields) under the SPUs we measured.\n",
"import json\n",
"from utils.gtools import MySQLconnect\n",
"import pandas as pd\n",
"\n",
"# NOTE(review): base_df must be the frame produced by the parsing cell above\n",
"# (it carries is_first/长/宽/高/重量); do NOT reset it to df.copy() here,\n",
"# otherwise the coefficient cell further down loses its inputs.\n",
"spu_list = (\n",
"    base_df['SPU']\n",
"    .apply(pd.to_numeric, errors='coerce')\n",
"    .dropna()\n",
"    .astype(int)\n",
"    .astype(str)\n",
"    .drop_duplicates()\n",
"    .tolist()\n",
")\n",
"\n",
"CHUNK_SIZE = 100  # SPUs per IN (...) query\n",
"\n",
"def chunk_list(lst, size):\n",
"    \"\"\"Yield consecutive slices of *lst* with at most *size* items each.\"\"\"\n",
"    for i in range(0, len(lst), size):\n",
"        yield lst[i:i+size]\n",
"\n",
"result_list = []\n",
"with MySQLconnect('ods') as db:\n",
"    enginal = db.engine()\n",
"    for chunk in chunk_list(spu_list, CHUNK_SIZE):\n",
"        quoted_spus = ','.join([f\"'{spu}'\" for spu in chunk])  # quote to keep the SQL valid\n",
"        sql = f\"\"\"\n",
"        SELECT\n",
"            产品品类,\n",
"            产品分类,\n",
"            SPU,\n",
"            SKU,\n",
"            sku.成本价,\n",
"            物流分摊,\n",
"            产品售价\n",
"        from stg_bayshop_litfad_spu spu\n",
"        LEFT JOIN stg_bayshop_litfad_sku sku ON sku.产品PID = spu.产品PID\n",
"        WHERE spu.SPU IN ({quoted_spus})\n",
"        \"\"\"\n",
"        df_chunk = pd.read_sql(sql, enginal)\n",
"        result_list.append(df_chunk)\n",
"        print(f\"已处理 {len(result_list) * CHUNK_SIZE} 个SPU\")\n",
"\n",
"result = pd.concat(result_list, ignore_index=True)\n",
"\n",
"# Attach the measured-package columns back onto every SKU of each SPU.\n",
"all_df = pd.merge(result, base_df, on=['SPU','SKU'], how='left')\n",
"all_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 所有的SKU 分类和汇总 ->层级一样的SKU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Group by SPU; within each group rank SKUs by ascending 成本价 (cost price).\n",
"# SKUs sharing the same cost price share a tier; the tier number is\n",
"# \"which distinct cost price this is\" (dense rank).\n",
"# NOTE(review): .astype(int) raises if any 成本价 is NaN (rank yields NaN for\n",
"# missing values) — confirm the upstream LEFT JOIN guarantees non-null prices.\n",
"all_df['层次'] = all_df.groupby('SPU')['成本价'].rank(method='dense').astype(int)\n",
"all_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 哪几个层级有实际数据,估算其他没有数据的层级的数据\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the ERP package JSON into numeric ERP size/weight columns on all_df.\n",
"# NOTE(review): no visible cell creates the 'ERP包裹数据' column on all_df —\n",
"# confirm the upstream query still selects it, otherwise this loop raises KeyError.\n",
"import re\n",
"import json\n",
"for index, row in all_df.iterrows():\n",
"    if not isinstance(row['ERP包裹数据'], str) or not row['ERP包裹数据']:\n",
"        print(f\"第{index}行包裹数据为空或非字符串,跳过\")\n",
"        continue\n",
"    try:\n",
"        package_dict = json.loads(row['ERP包裹数据'])\n",
"    except json.JSONDecodeError as e:\n",
"        print(f\"解析失败:第{index}行,错误信息:{e}\")\n",
"        continue\n",
"    count = len(package_dict)\n",
"    print(f\"第{index}行,包裹数量:{count}\")\n",
"    if count !=1:\n",
"        print(f\"第{index}行包裹数量不为1跳过\")\n",
"        continue\n",
"    \n",
"    try:\n",
"        for package in package_dict.values():\n",
"            item = {}\n",
"\n",
"            for key, value in package.items():\n",
"                try:\n",
"                    # Extract the leading numeric token (handles values like \"12.5cm\")\n",
"                    number_str = re.findall(r\"[-+]?\\d*\\.\\d+|\\d+\", str(value))\n",
"                    if number_str:\n",
"                        item[key] = float(number_str[0])  # first match, as float\n",
"                    else:\n",
"                        item[key] = value  # no numeric part — keep the raw value\n",
"                except ValueError:\n",
"                    item[key] = value  # unconvertible — keep the raw value\n",
"    except AttributeError:\n",
"        print(f\"解析失败:第{index}行,错误信息:包裹数据为空\")\n",
"        continue\n",
"    # Canonical orientation: 长 = longest edge, 高 = shortest\n",
"    size = []\n",
"    size.append(item['长'])\n",
"    size.append(item['宽'])\n",
"    size.append(item['高'])\n",
"    weight = item['重量']\n",
"    size.sort()\n",
"    length = size[2]\n",
"    width = size[1]\n",
"    height = size[0]\n",
"    all_df.loc[index, 'ERP包裹数'] = count\n",
"    all_df.loc[index, 'ERP长'] = length\n",
"    all_df.loc[index, 'ERP宽'] = width\n",
"    all_df.loc[index, 'ERP高'] = height\n",
"    all_df.loc[index, 'ERP重量'] = weight\n",
"    print(f\"{row['SKU']}尺寸为:{width},h:{height},d:{length},w:{weight}\")\n",
"    "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"计算每个SPU的长宽高重量系数"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compute per-SPU scaling coefficients between ERP dimensions and measured ones.\n",
"def cal_size(old, new):\n",
"    \"\"\"Relative change from *old* to *new*: (new - old) / old.\n",
"\n",
"    Returns None when either value is non-numeric, or when old == 0\n",
"    (avoids ZeroDivisionError).\n",
"    \"\"\"\n",
"    try:\n",
"        old = float(old)\n",
"        new = float(new)\n",
"    except (ValueError, TypeError):\n",
"        return None\n",
"    if old == 0:\n",
"        return None\n",
"    return (new - old) / old\n",
"\n",
"# .copy() so the .loc writes below land on this frame, not on a view of all_df\n",
"test_df = all_df[all_df['is_first'] == 1].copy()\n",
"# Baseline coefficients from the rows that have measured data\n",
"for index, row in test_df.iterrows():\n",
"    test_df.loc[index, '长系数'] = cal_size(row['ERP长'], row['长'])\n",
"    test_df.loc[index, '宽系数'] = cal_size(row['ERP宽'], row['宽'])\n",
"    test_df.loc[index, '高系数'] = cal_size(row['ERP高'], row['高'])\n",
"    test_df.loc[index, '重量系数'] = cal_size(row['ERP重量'], row['重量'])\n",
"    print(\n",
"        f\"{row['SPU']} 的系数为 \"\n",
"        f\"{test_df.loc[index, '长系数']}, \"\n",
"        f\"{test_df.loc[index, '宽系数']}, \"\n",
"        f\"{test_df.loc[index, '高系数']}, \"\n",
"        f\"{test_df.loc[index, '重量系数']}\"\n",
"    )\n",
"# Merge the baseline coefficients back onto all_df.\n",
"# drop_duplicates keeps one coefficient row per SPU so the merge cannot\n",
"# fan out rows when several measured orders exist for the same SPU.\n",
"coef_df = test_df[['SPU', '长系数', '宽系数', '高系数', '重量系数']].drop_duplicates(subset='SPU')\n",
"all_df = pd.merge(all_df, coef_df, on='SPU', how='left')\n",
"all_df.to_excel('单包裹SKU售价分析.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"计算每个sku的理论尺寸\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Reload the checkpoint written by the coefficient cell.\n",
"all_df = pd.read_excel('单包裹SKU售价分析.xlsx')\n",
"\n",
"# Predicted (\"theoretical\") dimensions for every SKU: scale each ERP value\n",
"# by its SPU-level coefficient, rounded to 2 decimals.\n",
"for dim in ['长', '宽', '高', '重量']:\n",
"    all_df[f'理论{dim}'] = ((1 + all_df[f'{dim}系数']) * all_df[f'ERP{dim}']).round(2)\n",
"all_df.to_excel('单包裹SKU售价分析.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"计算三种尺寸下的售价,计算预测后的尺寸下,一票一件订单的售价和订单价格"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Price each SKU under three sets of dimensions (measured / theoretical / ERP)\n",
"# for single-package, single-item orders.\n",
"from sell.sell_price import call_sell_and_order_price\n",
"\n",
"def _one_package(length, width, height, weight):\n",
"    \"\"\"Build the single-package payload expected by call_sell_and_order_price.\"\"\"\n",
"    return {'包裹1': {'长': length, '宽': width, '高': height, '重量': weight}}\n",
"\n",
"def _scalar(price):\n",
"    \"\"\"call_sell_and_order_price may return a number or a sequence; take the scalar.\"\"\"\n",
"    return price if isinstance(price, (int, float)) else price[0]\n",
"\n",
"for index, row in all_df.iterrows():\n",
"    price = row['成本价']\n",
"    try:\n",
"        pkg_actual = _one_package(row['长'], row['宽'], row['高'], row['重量'])\n",
"        pkg_theory = _one_package(row['理论长'], row['理论宽'], row['理论高'], row['理论重量'])\n",
"        pkg_erp = _one_package(row['ERP长'], row['ERP宽'], row['ERP高'], row['ERP重量'])\n",
"        sell_price1, order_price1, order_type1 = call_sell_and_order_price(price, pkg_actual, head_type=\"海运\")\n",
"        sell_price2, order_price2, order_type2 = call_sell_and_order_price(price, pkg_theory, head_type=\"海运\")\n",
"        sell_price3, order_price3, order_type3 = call_sell_and_order_price(price, pkg_erp, head_type=\"海运\")\n",
"    except Exception as e:\n",
"        # Best-effort per-row pricing: log the failure and move on.\n",
"        print(f\"SKU: {row['SKU']} 报错: {e}\")\n",
"        continue\n",
"    all_df.loc[index, 'ERP售价'] = _scalar(sell_price3)\n",
"    all_df.loc[index, '实际体积售价'] = _scalar(sell_price1)\n",
"    all_df.loc[index, '理论体积售价'] = _scalar(sell_price2)\n",
"    all_df.loc[index, 'ERP订单物流'] = order_price3\n",
"    all_df.loc[index, '实际体积订单物流'] = order_price1\n",
"    all_df.loc[index, '理论体积订单物流'] = order_price2\n",
"    print(f\"SPU: {row['SPU']}, SKU {row['SKU']} ,网站售价: {row['产品售价']}, ERP售价: {sell_price3}, 实际体积售价: {sell_price1}, 理论体积售价: {sell_price2},\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"写成可以上传批量修改尺寸的格式"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Produce the bulk-upload format for the size-correction tool.\n",
"# NOTE(review): '是否有过修改记录', '使用尺寸售价' and '售价涨跌幅' are not created in any\n",
"# visible cell — confirm they come from the Excel checkpoint or a prior step.\n",
"# .copy() so the column assignments below do not write into a view.\n",
"all_df = all_df[(all_df['是否有过修改记录']==\"否\")&(all_df['使用尺寸售价']!=\"ERP售价\")].copy()\n",
"all_df['SPU最大涨幅'] = all_df.groupby('SPU')['售价涨跌幅'].transform('max')\n",
"all_df['SPU最小涨幅'] = all_df.groupby('SPU')['售价涨跌幅'].transform('min')\n",
"\n",
"# Keep only SPUs whose every SKU moves at most ±50% in price.\n",
"# .copy() so the .loc writes in the loop land on this frame, not a view.\n",
"filtered_df = all_df[(all_df['SPU最大涨幅'] <= 0.5) & (all_df['SPU最小涨幅'] >= -0.5)].copy()\n",
"for index, row in filtered_df.iterrows():\n",
"    if row['使用尺寸售价']==\"实际体积售价\":\n",
"        length = str(row['长'])\n",
"        width = str(row['宽'])\n",
"        height = str(row['高'])\n",
"        weight = str(row['重量'])\n",
"    else:\n",
"        length = str(row['理论长'])\n",
"        width = str(row['理论宽'])\n",
"        height = str(row['理论高'])\n",
"        weight = str(row['理论重量'])\n",
"    filtered_df.loc[index, '尺寸重量'] = f\"{weight}|{length}*{width}*{height}*1,\"\n",
"\n",
"    print(index)\n",
"spu_list = filtered_df['SPU'].unique()\n",
"filtered_df = filtered_df[['SKU','成本价','尺寸重量']]\n",
"filtered_df "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}