logistics/改SPU尺寸.ipynb

545 lines
19 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"取数据"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"from utils.gtools import MySQLconnect\n",
"\n",
"# Fetch single-SKU / single-quantity orders together with their measured\n",
"# package dimensions; within each SKU, rows are ranked newest-first (rn).\n",
"# NOTE(review): the query relies on MySQL permitting non-aggregated columns\n",
"# under GROUP BY (ONLY_FULL_GROUP_BY disabled) -- confirm on the ODS server.\n",
"with MySQLconnect('ods') as db:\n",
"    sql = r\"\"\" \n",
" # 限制范围是测量时间取得SKU种类为1且数量为1的订单且重复SKU只取最近的订单\n",
"# 测量时间D +2 天进行汇总数据\n",
"# 订单汇总产品数和取出\n",
"# 测量时间D +2 天进行汇总数据\n",
"# 订单汇总产品数和取出\n",
"WITH\n",
"t1 AS (\n",
"SELECT\n",
"order_id,\n",
"SKU,\n",
"order_date,\n",
"sum(CASE WHEN opl.order_product_id REGEXP \"[0-9]{15}_[0-9]*$\"\n",
" THEN product_num END) AS product_num,\n",
"DATE_FORMAT(order_date,\"%Y-%m-%d\") AS 订单时间,\n",
"count(DISTINCT opl.SKU) AS 产品种类\n",
"FROM\n",
"dws.order_product_list opl\n",
"WHERE\n",
" NOT EXISTS (\n",
" SELECT 1 \n",
" FROM dws.log_order_reissue_detail AS r \n",
" WHERE left(r.order_product_id,15) = opl.order_id\n",
" \n",
" )\n",
"AND order_date >= \"2025-05-01\"\n",
"AND order_date < \"2025-09-18\"\n",
"AND SKU <> \"\"\n",
"GROUP BY order_id\n",
")\n",
",\n",
"t2 AS (\n",
"SELECT \n",
" a.`包裹测量时间`,\n",
" t1.order_id,\n",
" t1.SKU,\n",
" t1.order_date,\n",
" a.包裹号,\n",
" a.快递公司,\n",
" a.运输方式,\n",
" a.`目的国`,\n",
" d.postcode,\n",
" CONCAT(\n",
" '\"', b.package, '\": {',\n",
" '\"长\": ', length, ', ',\n",
" '\"宽\": ', width, ', ',\n",
" '\"高\": ', hight, ', ',\n",
" '\"重量\": ', weight, '}'\n",
" ) AS package_json\n",
" FROM\n",
" t1\n",
" LEFT JOIN order_express a ON t1.order_id = a.单号\n",
" JOIN package_vol_info b ON a.`包裹号` = b.package\n",
" JOIN order_list d ON a.`单号` = d.order_id \n",
" WHERE\n",
" a.`包裹状态` != '--'\n",
" AND b.hight > 0 \n",
" AND b.length > 0 \n",
" AND b.width > 0 \n",
" AND b.weight > 0\n",
" AND t1.product_num = 1\n",
" AND t1.产品种类=1\n",
" AND a.`包裹测量时间` >= '2025-06-01'\n",
" AND a.`包裹测量时间` < '2025-09-16'\n",
"),\n",
"t3 AS (\n",
"SELECT\n",
"t2.*,\n",
"SPU,\n",
"sku.成本价 AS ERP采购价,\n",
"CONCAT('{', GROUP_CONCAT(package_json SEPARATOR ','), '}') AS 实际包裹数据,\n",
"count(package_json) AS 包裹数,\n",
"ROW_NUMBER() OVER (PARTITION BY SKU ORDER BY 包裹测量时间 DESC) as rn\n",
"FROM\n",
"t2\n",
"LEFT JOIN stg_bayshop_litfad_sku sku ON t2.SKU=sku.SKU\n",
"left JOIN stg_bayshop_litfad_spu spu ON sku.产品PID=spu.产品PID\n",
"GROUP BY order_id\n",
")\n",
"SELECT\n",
"包裹测量时间,\n",
"order_id,\n",
"SPU,\n",
"SKU,\n",
"DATE_FORMAT(order_date,\"%Y-%m-%d\") AS 订单时间,\n",
"包裹号,\n",
"`快递公司`,\n",
"`运输方式`,\n",
"`目的国`,\n",
"postcode,\n",
"ERP采购价,\n",
"实际包裹数据,\n",
"包裹数,\n",
"rn AS 从新到旧\n",
"FROM\n",
"t3\n",
"\n",
"    \"\"\"\n",
"    df = pd.read_sql(sql, db.con)\n",
"    print(df)\n",
"    df.to_clipboard(index=False)  # handy for pasting straight into Excel\n",
"\n",
"# df=df[df['实际包裹数量']==1]\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"拆开实际包裹数据并标记为1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the single-package \"实际包裹数据\" JSON into numeric 长/宽/高/重量\n",
"# columns on base_df, flagging each successfully parsed row with is_first = 1.\n",
"import re\n",
"import json\n",
"\n",
"def _to_number(value):\n",
"    \"\"\"Extract the first numeric token in value as a float; otherwise return value unchanged.\"\"\"\n",
"    found = re.findall(r\"[-+]?\\d*\\.\\d+|\\d+\", str(value))\n",
"    return float(found[0]) if found else value\n",
"\n",
"base_df = df.copy()\n",
"for index, row in base_df.iterrows():\n",
"    raw = row['实际包裹数据']\n",
"    if not isinstance(raw, str) or not raw:\n",
"        print(f\"第{index}行包裹数据为空或非字符串,跳过\")\n",
"        continue\n",
"    try:\n",
"        package_dict = json.loads(raw)\n",
"    except json.JSONDecodeError as e:\n",
"        print(f\"解析失败:第{index}行,错误信息:{e}\")\n",
"        continue\n",
"    count = len(package_dict)\n",
"    print(f\"第{index}行,包裹数量:{count}\")\n",
"    if count != 1:\n",
"        print(f\"第{index}行包裹数量不为1跳过\")\n",
"        continue\n",
"    # Exactly one package at this point -- take it directly.\n",
"    package = next(iter(package_dict.values()))\n",
"    if not isinstance(package, dict):\n",
"        print(f\"解析失败:第{index}行,错误信息:包裹数据为空\")\n",
"        continue\n",
"    item = {key: _to_number(value) for key, value in package.items()}\n",
"    try:\n",
"        size = sorted([item['长'], item['宽'], item['高']])\n",
"        weight = item['重量']\n",
"    except KeyError as e:\n",
"        # Guard against packages missing a dimension key (was an uncaught KeyError).\n",
"        print(f\"第{index}行缺少尺寸字段{e},跳过\")\n",
"        continue\n",
"    # Normalise orientation so 长 >= 宽 >= 高.\n",
"    length, width, height = size[2], size[1], size[0]\n",
"    base_df.loc[index, 'is_first'] = 1\n",
"    base_df.loc[index, '长'] = length\n",
"    base_df.loc[index, '宽'] = width\n",
"    base_df.loc[index, '高'] = height\n",
"    base_df.loc[index, '重量'] = weight\n",
"    print(f\"{row['SKU']}尺寸为:{width},h:{height},d:{length},w:{weight}\")\n",
"base_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"取SPU下所有SKU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch every SKU (with its current pricing) under the SPUs collected above.\n",
"# NOTE: keep using the enriched base_df from the previous cell -- re-copying\n",
"# it from df here would drop the is_first/长/宽/高/重量 columns that the\n",
"# coefficient cell below depends on.\n",
"import json\n",
"from utils.gtools import MySQLconnect\n",
"import pandas as pd\n",
"\n",
"spu_list = (\n",
"    base_df['SPU']\n",
"    .apply(pd.to_numeric, errors='coerce')  # non-numeric SPU values -> NaN\n",
"    .dropna()\n",
"    .astype(int)\n",
"    .astype(str)\n",
"    .drop_duplicates()\n",
"    .tolist()\n",
")\n",
"\n",
"def chunk_list(lst, size):\n",
"    \"\"\"Yield consecutive slices of lst holding at most `size` items.\"\"\"\n",
"    for i in range(0, len(lst), size):\n",
"        yield lst[i:i+size]\n",
"\n",
"result_list = []\n",
"with MySQLconnect('ods') as db:\n",
"    enginal = db.engine()\n",
"    for chunk in chunk_list(spu_list, 100):\n",
"        quoted_spus = ','.join([f\"'{spu}'\" for spu in chunk])  # quote each value for the IN clause\n",
"        sql = f\"\"\"\n",
"        SELECT\n",
"        产品品类,\n",
"        产品分类,\n",
"        SPU,\n",
"        SKU,\n",
"        sku.成本价,\n",
"        物流分摊,\n",
"        产品售价\n",
"        from stg_bayshop_litfad_spu spu \n",
"        LEFT JOIN stg_bayshop_litfad_sku sku ON sku.产品PID = spu.产品PID\n",
"        WHERE spu.SPU IN ({quoted_spus})\n",
"        \"\"\"\n",
"        df_chunk = pd.read_sql(sql, enginal)\n",
"        result_list.append(df_chunk)\n",
"        # Cap the progress count so the last (partial) chunk is not over-reported.\n",
"        print(f\"已处理 {min(len(result_list) * 100, len(spu_list))} 个SPU\")\n",
"\n",
"result = pd.concat(result_list, ignore_index=True)\n",
"\n",
"# Attach the measured-package columns to each (SPU, SKU) pair.\n",
"all_df = pd.merge(result, base_df, on=['SPU','SKU'], how='left')\n",
"all_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 所有的SKU 分类和汇总 ->层级一样的SKU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Tier (层次) the SKUs inside each SPU by cost price: SKUs sharing a\n",
"# 成本价 share a tier, and tier k is the k-th distinct cost price in\n",
"# ascending order within that SPU.\n",
"dense_rank = all_df.groupby('SPU')['成本价'].rank(method='dense')\n",
"all_df['层次'] = dense_rank.astype(int)\n",
"all_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 哪几个层级有实际数据,估算其他没有数据的层级的数据\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the single-package \"ERP包裹数据\" JSON into numeric\n",
"# ERP长/ERP宽/ERP高/ERP重量 columns on all_df.\n",
"# NOTE(review): the SQL above never selects an 'ERP包裹数据' column --\n",
"# presumably it arrives via the exported Excel sheet; rows lacking it are\n",
"# now skipped instead of raising KeyError. TODO confirm the data source.\n",
"import re\n",
"import json\n",
"\n",
"def _erp_to_number(value):\n",
"    \"\"\"Extract the first numeric token in value as a float; otherwise return value unchanged.\"\"\"\n",
"    found = re.findall(r\"[-+]?\\d*\\.\\d+|\\d+\", str(value))\n",
"    return float(found[0]) if found else value\n",
"\n",
"for index, row in all_df.iterrows():\n",
"    raw = row.get('ERP包裹数据')  # tolerant lookup: column may be absent\n",
"    if not isinstance(raw, str) or not raw:\n",
"        print(f\"第{index}行包裹数据为空或非字符串,跳过\")\n",
"        continue\n",
"    try:\n",
"        package_dict = json.loads(raw)\n",
"    except json.JSONDecodeError as e:\n",
"        print(f\"解析失败:第{index}行,错误信息:{e}\")\n",
"        continue\n",
"    count = len(package_dict)\n",
"    print(f\"第{index}行,包裹数量:{count}\")\n",
"    if count != 1:\n",
"        print(f\"第{index}行包裹数量不为1跳过\")\n",
"        continue\n",
"    # Exactly one package at this point -- take it directly.\n",
"    package = next(iter(package_dict.values()))\n",
"    if not isinstance(package, dict):\n",
"        print(f\"解析失败:第{index}行,错误信息:包裹数据为空\")\n",
"        continue\n",
"    item = {key: _erp_to_number(value) for key, value in package.items()}\n",
"    try:\n",
"        size = sorted([item['长'], item['宽'], item['高']])\n",
"        weight = item['重量']\n",
"    except KeyError as e:\n",
"        # Guard against packages missing a dimension key (was an uncaught KeyError).\n",
"        print(f\"第{index}行缺少尺寸字段{e},跳过\")\n",
"        continue\n",
"    # Normalise orientation so ERP长 >= ERP宽 >= ERP高, matching the actual-package cell.\n",
"    length, width, height = size[2], size[1], size[0]\n",
"    all_df.loc[index, 'ERP包裹数'] = count\n",
"    all_df.loc[index, 'ERP长'] = length\n",
"    all_df.loc[index, 'ERP宽'] = width\n",
"    all_df.loc[index, 'ERP高'] = height\n",
"    all_df.loc[index, 'ERP重量'] = weight\n",
"    print(f\"{row['SKU']}尺寸为:{width},h:{height},d:{length},w:{weight}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"计算每个SPU的长宽高重量系数"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Compute, per SPU, the relative scale factor between ERP dimensions and the\n",
"# actually measured dimensions: (measured - erp) / erp.\n",
"def cal_size(old, new):\n",
"    \"\"\"Return (new - old) / old as a float.\n",
"\n",
"    Returns None when either value is non-numeric or old == 0, so callers\n",
"    never hit ZeroDivisionError / ValueError.\n",
"    \"\"\"\n",
"    try:\n",
"        old = float(old)\n",
"        new = float(new)\n",
"    except (ValueError, TypeError):\n",
"        return None\n",
"    if old == 0:\n",
"        return None\n",
"    return (new - old) / old\n",
"\n",
"# .copy() so the .loc assignments below write into a real frame rather than\n",
"# a view of all_df (avoids SettingWithCopyWarning / silently lost writes).\n",
"test_df = all_df[all_df['is_first'] == 1].copy()\n",
"for index, row in test_df.iterrows():\n",
"    test_df.loc[index, '长系数'] = cal_size(row['ERP长'], row['长'])\n",
"    test_df.loc[index, '宽系数'] = cal_size(row['ERP宽'], row['宽'])\n",
"    test_df.loc[index, '高系数'] = cal_size(row['ERP高'], row['高'])\n",
"    test_df.loc[index, '重量系数'] = cal_size(row['ERP重量'], row['重量'])\n",
"    print(\n",
"        f\"{row['SPU']} 的系数为 \"\n",
"        f\"{test_df.loc[index, '长系数']}, \"\n",
"        f\"{test_df.loc[index, '宽系数']}, \"\n",
"        f\"{test_df.loc[index, '高系数']}, \"\n",
"        f\"{test_df.loc[index, '重量系数']}\"\n",
"    )\n",
"# Keep one coefficient row per SPU: if several SKUs of one SPU carry actual\n",
"# data, the merge would otherwise duplicate every all_df row of that SPU.\n",
"coeff_df = test_df[['SPU', '长系数', '宽系数', '高系数', '重量系数']].drop_duplicates(subset='SPU')\n",
"all_df = pd.merge(all_df, coeff_df, on='SPU', how='left')\n",
"all_df.to_excel('单包裹SKU售价分析.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"计算每个sku的理论尺寸\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Reload the merged sheet and derive every SKU's predicted (理论) package\n",
"# size by applying its SPU-level coefficients to the ERP dimensions.\n",
"all_df = pd.read_excel('单包裹SKU售价分析.xlsx')\n",
"\n",
"for dim in ('长', '宽', '高', '重量'):\n",
"    all_df[f'理论{dim}'] = ((1 + all_df[f'{dim}系数']) * all_df[f'ERP{dim}']).round(2)\n",
"\n",
"all_df.to_excel('单包裹SKU售价分析.xlsx', index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"计算三种尺寸下的售价,计算预测后的尺寸下,一票一件订单的售价和订单价格"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Price each SKU under three dimension sets -- measured (实际), predicted\n",
"# (理论) and ERP -- and record the sell price plus order-level logistics cost.\n",
"from sell.sell_price import call_sell_and_order_price\n",
"\n",
"def _one_package(row, prefix=''):\n",
"    \"\"\"Build the single-package dict that call_sell_and_order_price expects,\n",
"    reading the 长/宽/高/重量 columns with the given prefix ('' / '理论' / 'ERP').\"\"\"\n",
"    return {'包裹1': {\n",
"        '长': row[f'{prefix}长'],\n",
"        '宽': row[f'{prefix}宽'],\n",
"        '高': row[f'{prefix}高'],\n",
"        '重量': row[f'{prefix}重量'],\n",
"    }}\n",
"\n",
"def _scalar(price):\n",
"    \"\"\"call_sell_and_order_price sometimes returns a sequence; take the first scalar.\"\"\"\n",
"    return price if isinstance(price, (int, float)) else price[0]\n",
"\n",
"for index, row in all_df.iterrows():\n",
"    price = row['成本价']\n",
"    try:\n",
"        sell_price1, order_price1, order_type1 = call_sell_and_order_price(price, _one_package(row), head_type=\"海运\")\n",
"        sell_price2, order_price2, order_type2 = call_sell_and_order_price(price, _one_package(row, '理论'), head_type=\"海运\")\n",
"        sell_price3, order_price3, order_type3 = call_sell_and_order_price(price, _one_package(row, 'ERP'), head_type=\"海运\")\n",
"    except Exception as e:\n",
"        # Best-effort: log the failing SKU and move on to the next row.\n",
"        print(f\"SKU: {row['SKU']} 报错: {e}\")\n",
"        continue\n",
"    all_df.loc[index, 'ERP售价'] = _scalar(sell_price3)\n",
"    all_df.loc[index, '实际体积售价'] = _scalar(sell_price1)\n",
"    all_df.loc[index, '理论体积售价'] = _scalar(sell_price2)\n",
"    all_df.loc[index, 'ERP订单物流'] = order_price3\n",
"    all_df.loc[index, '实际体积订单物流'] = order_price1\n",
"    all_df.loc[index, '理论体积订单物流'] = order_price2\n",
"    # all_df.loc[index, '理论体积订单类型'] = order_type2\n",
"    print(f\"SPU: {row['SPU']}, SKU {row['SKU']} ,网站售价: {row['产品售价']}, ERP售价: {sell_price3}, 实际体积售价: {sell_price1}, 理论体积售价: {sell_price2},\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"写成可以上传批量修改尺寸的格式"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Keep only SKUs never manually edited whose chosen price is not the ERP\n",
"# one, drop whole SPUs whose price would move more than +/-50%, then emit\n",
"# the \"weight|L*W*H*1,\" bulk-upload format.\n",
"# NOTE(review): 是否有过修改记录 / 使用尺寸售价 / 售价涨跌幅 are not created\n",
"# anywhere in this notebook -- presumably added in the exported Excel sheet;\n",
"# verify before a fresh Run-All.\n",
"# .copy() so the column assignments below write into real frames rather than\n",
"# views (avoids SettingWithCopyWarning / silently lost writes).\n",
"all_df = all_df[(all_df['是否有过修改记录'] == \"否\") & (all_df['使用尺寸售价'] != \"ERP售价\")].copy()\n",
"all_df['SPU最大涨幅'] = all_df.groupby('SPU')['售价涨跌幅'].transform('max')\n",
"all_df['SPU最小涨幅'] = all_df.groupby('SPU')['售价涨跌幅'].transform('min')\n",
"\n",
"filtered_df = all_df[(all_df['SPU最大涨幅'] <= 0.5) & (all_df['SPU最小涨幅'] >= -0.5)].copy()\n",
"for index, row in filtered_df.iterrows():\n",
"    # Choose which dimension set to upload for this SKU.\n",
"    if row['使用尺寸售价'] == \"实际体积售价\":\n",
"        length = str(row['长'])\n",
"        width = str(row['宽'])\n",
"        height = str(row['高'])\n",
"        weight = str(row['重量'])\n",
"    else:\n",
"        length = str(row['理论长'])\n",
"        width = str(row['理论宽'])\n",
"        height = str(row['理论高'])\n",
"        weight = str(row['理论重量'])\n",
"    filtered_df.loc[index, '尺寸重量'] = f\"{weight}|{length}*{width}*{height}*1,\"\n",
"\n",
"    print(index)\n",
"spu_list = filtered_df['SPU'].unique()\n",
"filtered_df = filtered_df[['SKU','成本价','尺寸重量']]\n",
"filtered_df "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}