{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "取数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "from sell.sell_price import call_sell_and_order_price\n",
    "from utils.gtools import MySQLconnect\n",
    "\n",
    "# Read the orders holding exactly one SKU with quantity 1, together with the\n",
    "# measured size/weight of their packages.\n",
    "# The final DATE_FORMAT uses %d (zero-padded day of month); %D would render an\n",
    "# English ordinal such as 18th and disagree with the format used in t1.\n",
    "# NOTE(review): rn (exported as 从新到旧) is computed but never filtered on, although\n",
    "# the SQL comment says only the latest order per SKU should be kept - TODO confirm.\n",
    "with MySQLconnect('ods') as db:\n",
    "    sql = r\"\"\" \n",
    "    # 限制范围是测量时间,取得SKU种类为1且数量为1的订单,且重复SKU只取最近的订单\n",
    "# 测量时间D +2 天进行汇总数据\n",
    "# 订单汇总产品数和取出\n",
    "# 测量时间D +2 天进行汇总数据\n",
    "# 订单汇总产品数和取出\n",
    "WITH\n",
    "t1 AS (\n",
    "SELECT\n",
    "order_id,\n",
    "SKU,\n",
    "order_date,\n",
    "sum(CASE WHEN opl.order_product_id REGEXP \"[0-9]{15}_[0-9]*$\"\n",
    " THEN product_num END) AS product_num,\n",
    "DATE_FORMAT(order_date,\"%Y-%m-%d\") AS 订单时间,\n",
    "count(DISTINCT opl.SKU) AS 产品种类\n",
    "FROM\n",
    "dws.order_product_list opl\n",
    "WHERE\n",
    " NOT EXISTS (\n",
    " SELECT 1 \n",
    " FROM dws.log_order_reissue_detail AS r \n",
    " WHERE left(r.order_product_id,15) = opl.order_id\n",
    " \n",
    " )\n",
    "AND order_date >= \"2025-05-01\"\n",
    "AND order_date < \"2025-09-18\"\n",
    "AND SKU <> \"\"\n",
    "GROUP BY order_id\n",
    ")\n",
    ",\n",
    "t2 AS (\n",
    "SELECT \n",
    " a.`包裹测量时间`,\n",
    " t1.order_id,\n",
    " t1.SKU,\n",
    " t1.order_date,\n",
    " a.包裹号,\n",
    " a.快递公司,\n",
    " a.运输方式,\n",
    " a.`目的国`,\n",
    " d.postcode,\n",
    " CONCAT(\n",
    " '\"', b.package, '\": {',\n",
    " '\"长\": ', length, ', ',\n",
    " '\"宽\": ', width, ', ',\n",
    " '\"高\": ', hight, ', ',\n",
    " '\"重量\": ', weight, '}'\n",
    " ) AS package_json\n",
    " FROM\n",
    " t1\n",
    " LEFT JOIN order_express a ON t1.order_id = a.单号\n",
    " JOIN package_vol_info b ON a.`包裹号` = b.package\n",
    " JOIN order_list d ON a.`单号` = d.order_id \n",
    " WHERE\n",
    " a.`包裹状态` != '--'\n",
    " AND b.hight > 0 \n",
    " AND b.length > 0 \n",
    " AND b.width > 0 \n",
    " AND b.hight > 0 \n",
    " AND b.weight > 0\n",
    " AND t1.product_num = 1\n",
    " AND t1.产品种类=1\n",
    " AND a.`包裹测量时间` >= '2025-06-01'\n",
    " AND a.`包裹测量时间` < '2025-09-16'\n",
    "),\n",
    "t3 AS (\n",
    "SELECT\n",
    "t2.*,\n",
    "SPU,\n",
    "sku.成本价 AS ERP采购价,\n",
    "CONCAT('{', GROUP_CONCAT(package_json SEPARATOR ','), '}') AS 实际包裹数据,\n",
    "count(package_json) AS 包裹数,\n",
    "ROW_NUMBER() OVER (PARTITION BY SKU ORDER BY 包裹测量时间 DESC) as rn\n",
    "FROM\n",
    "t2\n",
    "LEFT JOIN stg_bayshop_litfad_sku sku ON t2.SKU=sku.SKU\n",
    "left JOIN stg_bayshop_litfad_spu spu ON sku.产品PID=spu.产品PID\n",
    "GROUP BY order_id\n",
    ")\n",
    "SELECT\n",
    "包裹测量时间,\n",
    "order_id,\n",
    "SPU,\n",
    "SKU,\n",
    "DATE_FORMAT(order_date,\"%Y-%m-%d\") AS 订单时间,\n",
    "包裹号,\n",
    "`快递公司`,\n",
    "`运输方式`,\n",
    "`目的国`,\n",
    "postcode,\n",
    "ERP采购价,\n",
    "实际包裹数据,\n",
    "包裹数,\n",
    "rn AS 从新到旧\n",
    "FROM\n",
    "t3\n",
    "\n",
    "    \"\"\"\n",
    "    df = pd.read_sql(sql, db.con)\n",
    "    # Side effect kept from the original workflow: copy the raw result to the clipboard.\n",
    "    df.to_clipboard(index=False)\n",
    "\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "拆开实际包裹数据,并标记为1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Matches the first numeric token (decimal or integer) in a raw dimension value.\n",
    "NUMBER_PATTERN = re.compile(r\"[-+]?\\d*\\.\\d+|\\d+\")\n",
    "\n",
    "\n",
    "def parse_single_package(raw, index):\n",
    "    \"\"\"Parse a one-package JSON string into sorted dimensions and weight.\n",
    "\n",
    "    Args:\n",
    "        raw: JSON text mapping a package id to an object with the keys\n",
    "            长 / 宽 / 高 / 重量.\n",
    "        index: Row label, used only in the diagnostic messages.\n",
    "\n",
    "    Returns:\n",
    "        (length, width, height, weight) with length >= width >= height, or\n",
    "        None when the value is empty, malformed, or does not describe exactly\n",
    "        one package.\n",
    "    \"\"\"\n",
    "    if not isinstance(raw, str) or not raw:\n",
    "        print(f\"第{index}行包裹数据为空或非字符串,跳过\")\n",
    "        return None\n",
    "    try:\n",
    "        package_dict = json.loads(raw)\n",
    "    except json.JSONDecodeError as e:\n",
    "        print(f\"解析失败:第{index}行,错误信息:{e}\")\n",
    "        return None\n",
    "    # Valid JSON that is not an object (null, list, number) carries no package.\n",
    "    if not isinstance(package_dict, dict):\n",
    "        print(f\"解析失败:第{index}行,错误信息:包裹数据为空\")\n",
    "        return None\n",
    "    count = len(package_dict)\n",
    "    print(f\"第{index}行,包裹数量:{count}\")\n",
    "    if count != 1:\n",
    "        print(f\"第{index}行,包裹数量不为1,跳过\")\n",
    "        return None\n",
    "    package = next(iter(package_dict.values()))\n",
    "    if not isinstance(package, dict):\n",
    "        print(f\"解析失败:第{index}行,错误信息:包裹数据为空\")\n",
    "        return None\n",
    "\n",
    "    item = {}\n",
    "    for key, value in package.items():\n",
    "        # Keep the first number found; fall back to the raw value when there is none.\n",
    "        numbers = NUMBER_PATTERN.findall(str(value))\n",
    "        item[key] = float(numbers[0]) if numbers else value\n",
    "\n",
    "    try:\n",
    "        # Normalise orientation: the longest side is the length, the shortest the height.\n",
    "        height, width, length = sorted([item['长'], item['宽'], item['高']])\n",
    "        weight = item['重量']\n",
    "    except (KeyError, TypeError) as e:\n",
    "        # A missing key or a non-numeric dimension makes the package unusable.\n",
    "        print(f\"解析失败:第{index}行,错误信息:{e}\")\n",
    "        return None\n",
    "    return length, width, height, weight\n",
    "\n",
    "\n",
    "# Split the measured package data and flag the rows backed by a real measurement.\n",
    "base_df = df.copy()\n",
    "# Create the target columns up front so they exist even when no row can be parsed.\n",
    "for column in ('is_first', '长', '宽', '高', '重量'):\n",
    "    base_df[column] = float('nan')\n",
    "\n",
    "for index, row in base_df.iterrows():\n",
    "    parsed = parse_single_package(row['实际包裹数据'], index)\n",
    "    if parsed is None:\n",
    "        continue\n",
    "    length, width, height, weight = parsed\n",
    "    base_df.loc[index, 'is_first'] = 1\n",
    "    base_df.loc[index, '长'] = length\n",
    "    base_df.loc[index, '宽'] = width\n",
    "    base_df.loc[index, '高'] = height\n",
    "    base_df.loc[index, '重量'] = weight\n",
    "\n",
    "    print(f\"{row['SKU']}尺寸为:{width},h:{height},d:{length},w:{weight}\")\n",
    "base_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "取SPU下所有SKU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fetch every SKU, with its current selling price, under the SPUs found above.\n",
    "# base_df is deliberately NOT rebuilt from df here: it carries the is_first / 长 / 宽 /\n",
    "# 高 / 重量 columns that the coefficient step reads after the merge.\n",
    "SPU_CHUNK_SIZE = 100  # number of SPUs per IN (...) query\n",
    "\n",
    "spu_list = (\n",
    "    pd.to_numeric(base_df['SPU'], errors='coerce')\n",
    "    .dropna()\n",
    "    .astype(int)\n",
    "    .astype(str)\n",
    "    .drop_duplicates()\n",
    "    .tolist()\n",
    ")\n",
    "\n",
    "\n",
    "def chunk_list(lst, size):\n",
    "    \"\"\"Yield consecutive slices of lst holding at most size items.\"\"\"\n",
    "    for i in range(0, len(lst), size):\n",
    "        yield lst[i:i+size]\n",
    "\n",
    "\n",
    "result_list = []\n",
    "processed = 0\n",
    "with MySQLconnect('ods') as db:\n",
    "    enginal = db.engine()\n",
    "    for chunk in chunk_list(spu_list, SPU_CHUNK_SIZE):\n",
    "        # Every entry went through int -> str above, so inline quoting is safe here.\n",
    "        quoted_spus = ','.join([f\"'{spu}'\" for spu in chunk])\n",
    "        sql = f\"\"\"\n",
    "        SELECT\n",
    "        产品品类,\n",
    "        产品分类,\n",
    "        SPU,\n",
    "        SKU,\n",
    "        sku.成本价,\n",
    "        物流分摊,\n",
    "        产品售价\n",
    "        from stg_bayshop_litfad_spu spu \n",
    "        LEFT JOIN stg_bayshop_litfad_sku sku ON sku.产品PID = spu.产品PID\n",
    "        WHERE spu.SPU IN ({quoted_spus})\n",
    "        \"\"\"\n",
    "        df_chunk = pd.read_sql(sql, enginal)\n",
    "        result_list.append(df_chunk)\n",
    "        # Count the real chunk length so the last, shorter chunk is not over-reported.\n",
    "        processed += len(chunk)\n",
    "        print(f\"已处理 {processed} 个SPU\")\n",
    "\n",
    "result = pd.concat(result_list, ignore_index=True)\n",
    "\n",
    "# Left-join the measured rows onto the full SKU list.\n",
    "# NOTE(review): base_df can hold several measured orders for one SKU (从新到旧 > 1),\n",
    "# which repeats that SKU in all_df - filter on 从新到旧 == 1 first if only the\n",
    "# latest measurement should be used. TODO confirm.\n",
    "all_df = pd.merge(result, base_df, on=['SPU','SKU'], how='left')\n",
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 所有的SKU 分类和汇总 ->层级一样的SKU"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Within each SPU, SKUs sharing a cost price belong to the same tier; the tier\n",
    "# number is the dense rank of that cost price in ascending order.\n",
    "# Nullable Int64 keeps rows with a missing cost price as <NA> instead of raising.\n",
    "all_df['层次'] = all_df.groupby('SPU')['成本价'].rank(method='dense').astype('Int64')\n",
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 哪几个层级有实际数据,估算其他没有数据的层级的数据\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the ERP package data with the same parser used for the measured data.\n",
    "# NOTE(review): no query in this notebook selects an 'ERP包裹数据' column, so it has\n",
    "# to be added upstream (presumably from the SKU table) - TODO confirm the source.\n",
    "if 'ERP包裹数据' not in all_df.columns:\n",
    "    raise KeyError(\"all_df has no 'ERP包裹数据' column; add it to the SKU query first\")\n",
    "\n",
    "# Create the target columns up front so they exist even when no row can be parsed.\n",
    "for column in ('ERP包裹数', 'ERP长', 'ERP宽', 'ERP高', 'ERP重量'):\n",
    "    all_df[column] = float('nan')\n",
    "\n",
    "for index, row in all_df.iterrows():\n",
    "    parsed = parse_single_package(row['ERP包裹数据'], index)\n",
    "    if parsed is None:\n",
    "        continue\n",
    "    length, width, height, weight = parsed\n",
    "    # The parser only succeeds for exactly one package, so the count is always 1.\n",
    "    all_df.loc[index, 'ERP包裹数'] = 1\n",
    "    all_df.loc[index, 'ERP长'] = length\n",
    "    all_df.loc[index, 'ERP宽'] = width\n",
    "    all_df.loc[index, 'ERP高'] = height\n",
    "    all_df.loc[index, 'ERP重量'] = weight\n",
    "    print(f\"{row['SKU']}尺寸为:{width},h:{height},d:{length},w:{weight}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "计算每个SPU的长宽高重量系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def cal_size(old, new):\n",
    "    \"\"\"Return the relative deviation (new - old) / old.\n",
    "\n",
    "    Returns None when old is zero or when either value cannot be converted to\n",
    "    float.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        old = float(old)\n",
    "        new = float(new)\n",
    "    except (ValueError, TypeError):\n",
    "        return None\n",
    "    if old == 0:\n",
    "        return None  # avoid dividing by zero\n",
    "    return (new - old) / old\n",
    "\n",
    "\n",
    "# Coefficient column -> (ERP column, measured column).\n",
    "COEF_SOURCES = {\n",
    "    '长系数': ('ERP长', '长'),\n",
    "    '宽系数': ('ERP宽', '宽'),\n",
    "    '高系数': ('ERP高', '高'),\n",
    "    '重量系数': ('ERP重量', '重量'),\n",
    "}\n",
    "coef_cols = list(COEF_SOURCES)\n",
    "\n",
    "# Work on a copy so the assignments below never write into a slice of all_df.\n",
    "test_df = all_df[all_df['is_first'] == 1].copy()\n",
    "for coef_col in coef_cols:\n",
    "    test_df[coef_col] = float('nan')\n",
    "\n",
    "# Coefficients of the measured (baseline) rows.\n",
    "for index, row in test_df.iterrows():\n",
    "    for coef_col, (erp_col, real_col) in COEF_SOURCES.items():\n",
    "        test_df.loc[index, coef_col] = cal_size(row[erp_col], row[real_col])\n",
    "    print(\n",
    "        f\"{row['SPU']} 的系数为 \"\n",
    "        f\"{test_df.loc[index, '长系数']}, \"\n",
    "        f\"{test_df.loc[index, '宽系数']}, \"\n",
    "        f\"{test_df.loc[index, '高系数']}, \"\n",
    "        f\"{test_df.loc[index, '重量系数']}\"\n",
    "    )\n",
    "\n",
    "# Keep one coefficient row per SPU: several measured rows in the same SPU would\n",
    "# otherwise multiply the rows of all_df in the merge below, so they are averaged.\n",
    "spu_coef_df = test_df.groupby('SPU', as_index=False)[coef_cols].mean()\n",
    "\n",
    "# Drop stale coefficient columns first so re-running this cell does not create\n",
    "# _x / _y suffixed duplicates.\n",
    "all_df = pd.merge(\n",
    "    all_df.drop(columns=coef_cols, errors='ignore'),\n",
    "    spu_coef_df,\n",
    "    on='SPU',\n",
    "    how='left',\n",
    ")\n",
    "all_df.to_excel('单包裹SKU售价分析.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "计算每个sku的理论尺寸\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the sheet written by the previous step.\n",
    "all_df = pd.read_excel('单包裹SKU售价分析.xlsx')\n",
    "\n",
    "# Predicted size of every SKU = ERP size scaled by the SPU-level coefficient.\n",
    "for dim in ('长', '宽', '高', '重量'):\n",
    "    all_df[f'理论{dim}'] = ((1 + all_df[f'{dim}系数']) * all_df[f'ERP{dim}']).round(2)\n",
    "all_df.to_excel('单包裹SKU售价分析.xlsx', index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "计算三种尺寸下的售价,计算预测后的尺寸下,一票一件订单的售价和订单价格"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Price every SKU under three size variants: measured, predicted and ERP.\n",
    "# Output-column prefix -> prefix of the dimension columns it reads.\n",
    "# The iteration order is also the order in which the pricing function is called.\n",
    "SIZE_VARIANTS = {\n",
    "    '实际体积': '',\n",
    "    '理论体积': '理论',\n",
    "    'ERP': 'ERP',\n",
    "}\n",
    "# Order in which the result columns are written (fixes the column order of all_df).\n",
    "RESULT_ORDER = ('ERP', '实际体积', '理论体积')\n",
    "DIMENSIONS = ('长', '宽', '高', '重量')\n",
    "\n",
    "\n",
    "def first_price(value):\n",
    "    \"\"\"Return value itself when it is an int or float, otherwise value[0].\n",
    "\n",
    "    NOTE(review): presumably the pricing function sometimes returns a sequence\n",
    "    of prices - confirm against sell.sell_price.\n",
    "    \"\"\"\n",
    "    return value if isinstance(value, (int, float)) else value[0]\n",
    "\n",
    "\n",
    "for index, row in all_df.iterrows():\n",
    "    price = row['成本价']\n",
    "    quotes = {}\n",
    "    try:\n",
    "        for label, prefix in SIZE_VARIANTS.items():\n",
    "            package_dict = {'包裹1': {dim: row[prefix + dim] for dim in DIMENSIONS}}\n",
    "            sell_price, order_price, _order_type = call_sell_and_order_price(\n",
    "                price, package_dict, head_type=\"海运\"\n",
    "            )\n",
    "            quotes[label] = (sell_price, order_price)\n",
    "    except Exception as e:\n",
    "        # Best effort: report the SKU that cannot be priced and move on.\n",
    "        print(f\"SKU: {row['SKU']} 报错: {e}\")\n",
    "        continue\n",
    "\n",
    "    for label in RESULT_ORDER:\n",
    "        all_df.loc[index, f'{label}售价'] = first_price(quotes[label][0])\n",
    "    for label in RESULT_ORDER:\n",
    "        all_df.loc[index, f'{label}订单物流'] = quotes[label][1]\n",
    "    print(f\"SPU: {row['SPU']}, SKU {row['SKU']} ,网站售价: {row['产品售价']}, ERP售价: {quotes['ERP'][0]}, 实际体积售价: {quotes['实际体积'][0]}, 理论体积售价: {quotes['理论体积'][0]},\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "写成可以上传批量修改尺寸的格式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): '是否有过修改记录', '使用尺寸售价' and '售价涨跌幅' are not created anywhere\n",
    "# in this notebook - presumably they are added to the exported sheet by hand.\n",
    "# TODO confirm where they come from.\n",
    "MAX_PRICE_CHANGE = 0.5  # largest per-SPU price change, in either direction, still exported\n",
    "\n",
    "\n",
    "def format_size_weight(row):\n",
    "    \"\"\"Build the upload value weight|length*width*height*1, for one row.\n",
    "\n",
    "    Measured sizes are used when 使用尺寸售价 is 实际体积售价, predicted sizes otherwise.\n",
    "    \"\"\"\n",
    "    prefix = '' if row['使用尺寸售价'] == \"实际体积售价\" else '理论'\n",
    "    length = str(row[prefix + '长'])\n",
    "    width = str(row[prefix + '宽'])\n",
    "    height = str(row[prefix + '高'])\n",
    "    weight = str(row[prefix + '重量'])\n",
    "    return f\"{weight}|{length}*{width}*{height}*1,\"\n",
    "\n",
    "\n",
    "# Keep all_df untouched so this cell can be re-run; filter into new frames instead.\n",
    "candidate_df = all_df[(all_df['是否有过修改记录']==\"否\")&(all_df['使用尺寸售价']!=\"ERP售价\")].copy()\n",
    "candidate_df['SPU最大涨幅'] = candidate_df.groupby('SPU')['售价涨跌幅'].transform('max')\n",
    "candidate_df['SPU最小涨幅'] = candidate_df.groupby('SPU')['售价涨跌幅'].transform('min')\n",
    "\n",
    "# Only export SPUs whose price change stays within the allowed band for every row.\n",
    "filtered_df = candidate_df[\n",
    "    (candidate_df['SPU最大涨幅'] <= MAX_PRICE_CHANGE)\n",
    "    & (candidate_df['SPU最小涨幅'] >= -MAX_PRICE_CHANGE)\n",
    "].copy()\n",
    "# Built in one assignment so the column exists even when nothing passes the filter.\n",
    "filtered_df['尺寸重量'] = [format_size_weight(row) for _, row in filtered_df.iterrows()]\n",
    "\n",
    "spu_list = filtered_df['SPU'].unique()\n",
    "filtered_df = filtered_df[['SKU','成本价','尺寸重量']]\n",
    "filtered_df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}