mysql数据分析实战
使用英国某零售平台交易数据
# 查看特征和类型
# 发票编号、商品名称、简介、数量、日期、单价、用户ID、国家
DESC uk
# 查看前5行
SELECT * FROM uk LIMIT 5;
1 缺失值检测
# 总行数、各字段的行数(是否存在缺失值)
SELECT COUNT(*),COUNT(invoiceno),COUNT(stockcode),COUNT(description),
COUNT(quantity),COUNT(invoicedate),COUNT(unitprice),COUNT(customerid),
COUNT(country) FROM uk;
# 共541909行,其中发票编号、描述、customerid存在缺失
2 异常值检测
数值型的数量和单价可能存在异常
【数量和单价】
SELECT MAX(quantity),MIN(quantity),MAX(unitprice),MIN(unitprice)
FROM uk;
SELECT COUNT(*) FROM uk WHERE quantity<=0 # 10000多条
SELECT COUNT(*) FROM uk WHERE unitprice<=0 # 2条异常
# 去掉2条小于0的,将10000条的变为正数
【样本时间范围】-2010-12-01到2011-12-09
SELECT MAX(invoicedate),MIN(invoicedate) FROM uk;
【每月记录数】-没有很少的
SELECT YEAR(invoicedate),MONTH(invoicedate),COUNT(*)
FROM uk
GROUP BY YEAR(invoicedate),MONTH(invoicedate);
3 重复值检测-536641,541909存在重复值
SELECT COUNT(*) FROM (SELECT DISTINCT * FROM uk) t;
SELECT COUNT(*) FROM uk;
4 处理缺失、异常、重复
# quantity<0的变为正
UPDATE uk
SET quantity=quantity*(-1)
WHERE quantity<0
# 删除customerid为空,unitprice<0,quantity=0
DELETE FROM uk
WHERE CustomerID IS NULL
OR unitprice<=0 OR quantity=0
# 再次查看是否缺失,总记录406789,invoiceNo397884条
SELECT COUNT(invoiceno),COUNT(stockcode),COUNT(description),
COUNT(quantity),COUNT(invoicedate),COUNT(unitprice),COUNT(customerid),
COUNT(country) FROM uk
# 再次查看是否存在重复值-存在
SELECT COUNT(*) FROM (SELECT DISTINCT * FROM uk) t;
SELECT COUNT(*) FROM uk;
# 建新表,去重,删去原表
CREATE TABLE uk2 AS
SELECT DISTINCT * FROM uk;
DROP TABLE uk;
# invoiceNo缺失比例计算-各月比例大致相同,直接删除对订单月分布影响不大
SELECT YEAR(invoicedate) `年`,MONTH(invoicedate) `月`,
SUM(InvoiceNo IS NULL) `缺失数`,COUNT(*) `记录数`,ROUND(SUM(InvoiceNo IS NULL)/COUNT(*),2) `缺失比例`
FROM uk2
GROUP BY YEAR(invoicedate),MONTH(invoicedate)
# 删除缺失
DELETE FROM uk2 WHERE Invoiceno IS NULL;
交易总体状况
SELECT COUNT(DISTINCT invoiceno) `总订单量`,ROUND(SUM(unitprice*quantity),2) `总订单金额`,
COUNT(DISTINCT customerid) `消费者数量`,SUM(quantity) `商品总销量`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT customerid),2) `客单价`,
ROUND(SUM(quantity)/COUNT(DISTINCT invoiceno),2) `连带率`
FROM uk2;
# 每月订单数、订单金额、消费者数量、商品总量
SELECT DATE_FORMAT(invoicedate,'%Y年%m月') `月份`,
COUNT(DISTINCT invoiceno) `订单数`,
ROUND(SUM(unitprice*quantity),2) `订单金额`,
COUNT(DISTINCT customerid) `消费者数量`,
SUM(quantity) `出售商品总量`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT customerid),2) `客单价`,
ROUND(SUM(quantity)/COUNT(DISTINCT customerid),2) `连带率`,
ROUND(SUM(unitprice*quantity)/COUNT(DISTINCT invoiceno),2) `每单平均金额`,
ROUND(SUM(quantity)/COUNT(DISTINCT invoiceno)) `每单平均数量`
FROM uk2
GROUP BY DATE_FORMAT(invoicedate,'%Y年%m月')
# 10月、11月是订单高峰期
一、订单量分布
按交易地区
订单top5的国家
SELECT country,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),3) `比例`
FROM uk2 GROUP BY country ORDER BY `订单数` DESC LIMIT 5;
按交易时间
一个月内各天订单数及比例
SELECT DAY(invoicedate) `日`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),2) `比例`
FROM uk2
GROUP BY DAY(invoicedate)
ORDER BY `日`;
# 月末订单明显少
星期内订单数及比例
SELECT DAYOFWEEK(invoicedate) `星期`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoiceno) FROM uk2),2) `比例`
FROM uk2
GROUP BY DAYOFWEEK(invoicedate)
ORDER BY `星期`;
# 1表示星期日,周六不营业?周四订单量最大
每小时内订单数及比例
SELECT CONCAT(HOUR(invoicedate),'-',HOUR(invoicedate)+1) `时间`,COUNT(DISTINCT invoiceno) `订单数`,
ROUND(COUNT(DISTINCT invoiceno)/(SELECT COUNT(DISTINCT invoicedate) FROM uk2),3) `比例`
FROM uk2
GROUP BY CONCAT(HOUR(invoicedate),'-',HOUR(invoicedate)+1)
ORDER BY HOUR(invoicedate);
# 10到16是集中时间
每单金额、数量及商品单价分布
每单消费金额分布
SELECT `金额`,COUNT(*) `数量`
FROM (
SELECT (CASE
WHEN amount<100 THEN '0-100'
WHEN amount BETWEEN 100 AND 1000 THEN '100-1000'
WHEN amount BETWEEN 1001 AND 2000 THEN '1000-2000'
WHEN amount BETWEEN 2001 AND 3000 THEN '2000-3000'
WHEN amount BETWEEN 3001 AND 4000 THEN '3000-4000'
WHEN amount BETWEEN 4001 AND 5000 THEN '4000-5000'
WHEN amount BETWEEN 5001 AND 6000 THEN '5000-6000'
ELSE '6000以上' END) `金额`
FROM
(
SELECT invoiceno,SUM(unitprice*quantity) amount
FROM uk2 GROUP BY invoiceno
) temp
) t
GROUP BY `金额` ;
商品单价分布
SELECT `价格`,COUNT(*) `数量`
FROM (
SELECT CASE
WHEN unitprice<1 THEN '0-1'
WHEN unitprice BETWEEN 2 AND 5 THEN '2-5'
WHEN unitprice BETWEEN 6 AND 10 THEN '6-10'
WHEN unitprice BETWEEN 11 AND 20 THEN '11-20'
WHEN unitprice BETWEEN 21 AND 30 THEN '21-30'
WHEN unitprice BETWEEN 31 AND 50 THEN '31-50'
WHEN unitprice BETWEEN 51 AND 100 THEN '51-100'
WHEN unitprice BETWEEN 101 AND 300 THEN '101-300'
WHEN unitprice BETWEEN 301 AND 500 THEN '301-500'
WHEN unitprice BETWEEN 501 AND 1000 THEN '501-1000'
ELSE '1000以上' END `价格`
FROM uk2
) temp
GROUP BY `价格`
# 购买量top20的单价
SELECT unitprice,COUNT(*)`频率` FROM uk2 GROUP BY unitprice ORDER BY `频率` DESC LIMIT 20;
3 订购量分布
SELECT `订购量`,COUNT(*) '数量'
FROM
(
SELECT (CASE
WHEN amount<100 THEN '0-100'
WHEN amount BETWEEN 101 AND 500 THEN '100-500'
WHEN amount BETWEEN 501 AND 1000 THEN '500-1000'
WHEN amount BETWEEN 1001 AND 2000 THEN '1000-2000'
WHEN amount BETWEEN 2001 AND 3000 THEN '2000-3000'
ELSE '3000以上' END)`订购量`
FROM
(
SELECT invoiceno,SUM(quantity) amount
FROM uk2 GROUP BY invoiceno
) temp
) t
GROUP BY `订购量`;
每单数量的中位数(四分位数)
SELECT AVG(amount) `中位数`
FROM (
SELECT @index:=@index+1 AS num, amount
FROM (
SELECT invoiceno,SUM(quantity) amount
FROM uk2 GROUP BY invoiceno ORDER BY amount
) temp,(SELECT @index:=0) ti
) t
WHERE num=FLOOR((@index+1)/2) OR num=CEIL((@index+1)/2)
# @index是最后一个index,也就是最大的值
# 下四分位:where num=FLOOR((@index+1)/2),select amount
# 上四分位数:where num=FLOOR(3*(@index+1)/2),select amount
二、每月留存率
2010/12月的消费者为初始,每月的新增消费者在以后月份的留存情况
CREATE VIEW customer_orders AS
SELECT CustomerID,
IF(SUM(YEAR(invoicedate)='2010')>0,1,0) '2010/12',
IF(SUM(MONTH(invoicedate)='1')>0,1,0) '2011/01',
IF(SUM(MONTH(invoicedate)='2')>0,1,0) '2011/02',
IF(SUM(MONTH(invoicedate)='3')>0,1,0) '2011/03',
IF(SUM(MONTH(invoicedate)='4')>0,1,0) '2011/04',
IF(SUM(MONTH(invoicedate)='5')>0,1,0) '2011/05',
IF(SUM(MONTH(invoicedate)='6')>0,1,0) '2011/06',
IF(SUM(MONTH(invoicedate)='7')>0,1,0) '2011/07',
IF(SUM(MONTH(invoicedate)='8')>0,1,0) '2011/08',
IF(SUM(MONTH(invoicedate)='9')>0,1,0) '2011/09',
IF(SUM(MONTH(invoicedate)='10')>0,1,0) '2011/10',
IF(SUM(MONTH(invoicedate)='11')>0,1,0) '2011/11',
IF(SUM(DATE_FORMAT(invoicedate,'%Y/%m')='2011/12')>0,1,0) '2011/12'
FROM uk2
GROUP BY CustomerID;
SELECT * FROM customer_orders;
# 1表示消费了,0表示没有消费
5个月后的留存
CREATE TABLE retention AS
SELECT '2010/12' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/01`=1) `+1`,
SUM(`2011/01`+`2011/02`=2) `+2`,SUM(`2011/01`+`2011/02`+`2011/03`=3) `+3`,
SUM(`2011/01`+`2011/02`+`2011/03`+`2011/04`=4)`+4`,SUM(`2011/01`+`2011/02`+`2011/03`+`2011/04`+`2011/05`=5)`+5`
FROM customer_orders WHERE `2010/12`=1
UNION
SELECT '2011/01' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/02`=1) `+1`,
SUM(`2011/02`+`2011/03`=2) `+2`,SUM(`2011/02`+`2011/03`+`2011/04`=3) `+3`,
SUM(`2011/02`+`2011/03`+`2011/04`+`2011/05`=4)`+4`,SUM(`2011/02`+`2011/03`+`2011/04`+`2011/05`+`2011/06`=5)`+5`
FROM customer_orders WHERE `2010/12`=0 AND `2011/01`=1
UNION
SELECT '2011/02' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/03`=1) `+1`,
SUM(`2011/03`+`2011/04`=2) `+2`,SUM(`2011/03`+`2011/04`+`2011/05`=3) `+3`,
SUM(`2011/03`+`2011/04`+`2011/05`+`2011/06`=4)`+4`,SUM(`2011/03`+`2011/04`+`2011/05`+`2011/06`+`2011/07`=5)`+5`
FROM customer_orders WHERE `2010/12`=`2011/01`=0 AND `2011/02`=1
UNION
SELECT '2011/03' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/04`=1) `+1`,
SUM(`2011/04`+`2011/05`=2) `+2`,SUM(`2011/04`+`2011/05`+`2011/06`=3) `+3`,
SUM(`2011/04`+`2011/05`+`2011/06`+`2011/07`=4)`+4`,SUM(`2011/04`+`2011/05`+`2011/06`+`2011/07`+`2011/08`=5)`+5`
FROM customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=0 AND `2011/03`=1
UNION
SELECT '2011/04' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/02`=5) `+1`,
SUM(`2011/05`+`2011/06`=2) `+2`,SUM(`2011/05`+`2011/06`+`2011/07`=3) `+3`,
SUM(`2011/05`+`2011/06`+`2011/07`+`2011/08`=4)`+4`,SUM(`2011/05`+`2011/06`+`2011/07`+`2011/08`+`2011/09`=5)`+5`
FROM customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=`2011/03`=0 AND `2011/04`=1
UNION
SELECT '2011/05' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/06`=1) `+1`,
SUM(`2011/06`+`2011/07`=2) `+2`,SUM(`2011/06`+`2011/07`+`2011/08`=3) `+3`,
SUM(`2011/06`+`2011/07`+`2011/08`+`2011/09`=4)`+4`,SUM(`2011/06`+`2011/07`+`2011/08`+`2011/09`+`2011/10`=5)`+5`
FROM customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=`2011/03`=`2011/04`=0 AND `2011/05`=1
UNION
SELECT '2011/06' AS`月份`,COUNT(*) AS `新增`,SUM(`2011/07`=1) `+1`,
SUM(`2011/07`+`2011/08`=2) `+2`,SUM(`2011/07`+`2011/08`+`2011/09`=3) `+3`,
SUM(`2011/07`+`2011/08`+`2011/09`+`2011/10`=4)`+4`,SUM(`2011/07`+`2011/08`+`2011/09`+`2011/10`+`2011/11`=5)`+5`
FROM customer_orders WHERE `2010/12`=`2011/01`=`2011/02`=`2011/03`=`2011/04`=`2011/05`=0 AND `2011/06`=1
SELECT * FROM retention;
三、复购率
购买次数及其人数
SELECT times `购买次数`,COUNT(*) `人数`
FROM
(
SELECT customerid,COUNT(DISTINCT invoiceno) times
FROM uk2 GROUP BY customerid
) temp
GROUP BY times ORDER BY times;
复购率
SELECT ROUND(COUNT(customerid)/ (SELECT COUNT(DISTINCT customerid) FROM uk2),2) `复购率`
FROM
(SELECT customerid FROM uk2 GROUP BY customerid
HAVING COUNT(DISTINCT invoiceno)>1) fugou
复购的间隔
CREATE TABLE fugou AS # 生成复购表
SELECT temp.*,@num:=IF(customerid<>@c,1,IF(mydate>@date,@num+1,@num)) num,
@c:=customerid,@date:=mydate
FROM (
SELECT uk2.customerid,DATE(invoicedate) mydate
FROM uk2 JOIN (
SELECT customerid FROM uk2
GROUP BY customerid
HAVING COUNT(DISTINCT DATE(invoicedate))>1 # 一张发票也许对应多个时间,但只要在一天就视为一个订单
)fugou
ON uk2.`CustomerID`=fugou.customerid
ORDER BY uk2.customerid,mydate
) temp,(SELECT @num:=0,@c:=' ',@date:='2000-01-01') t
GROUP BY customerid,mydate # 每个顾客每个订单上有多个产品,需要分组
ORDER BY customerid,mydate;
SELECT * FROM fugou;
每个消费者的平均复购时间间隔
SELECT f1.customerid,MAX(f1.num) `下单次数`,1*AVG(DATEDIFF(f1.mydate,f2.mydate)) `平均间隔天数`
FROM fugou f1,fugou f2
WHERE f1.customerid=f2.customerid
AND f1.num-f2.num=1 # 本次,上一次
GROUP BY f1.customerid
ORDER BY `下单次数` DESC,`平均间隔天数` DESC
新客户首日购买后,第二天(n日内)依旧购买的比例
SELECT ROUND(COUNT(DISTINCT uk2.customerid)/(SELECT COUNT(DISTINCT customerid) FROM uk2),2) `第二天回购比例`
FROM uk2 JOIN (SELECT customerid,MIN(invoicedate) first_buy FROM uk2 GROUP BY customerid) t
ON uk2.`CustomerID`=t.customerid
AND DATEDIFF(uk2.`InvoiceDate`,first_buy)=1 # <n
# DATEDIFF(uk2.`InvoiceDate`,first_buy)<30 AND uk2.`InvoiceDate`<>first_buy 30天内回购比例:0.22
2010年12月每日新用户数量,及其在一个月内的复购人数、复购比例
SELECT fd,COUNT(DISTINCT t.customerid) `新用户数`,COUNT(DISTINCT uk2.`CustomerID`) `30天内复购人数`,
ROUND(COUNT(DISTINCT uk2.`CustomerID`)/COUNT(DISTINCT t.customerid),2)`30天内复购比例`
FROM
uk2 RIGHT JOIN
(
SELECT customerid,MIN(DATE(invoicedate)) fd
FROM uk2 WHERE YEAR(invoicedate)=2010 GROUP BY customerid
) t
ON uk2.`CustomerID`=t.customerid
AND DATEDIFF(DATE(uk2.`InvoiceDate`),fd)<=30 AND DATE(uk2.`InvoiceDate`)<>fd
GROUP BY fd;
# 第二个and条件必须使用DATE函数变为日期,t表中是日期类型的
新老顾客标签
CREATE TABLE customer_type AS
SELECT uk2.customerid,DATE(uk2.`InvoiceDate`) d,
(CASE
WHEN fugou.`customerid` IS NULL THEN '新客户'
WHEN fugou.num=1 THEN '新客户'
ELSE '老客户' END) `客户类型`
FROM uk2 LEFT JOIN fugou ON uk2.`CustomerID`=fugou.`customerid`
AND DATE(uk2.`InvoiceDate`)=fugou.`mydate`;
SELECT DATE_FORMAT(d,'%Y-%m') `月份`,`客户类型`,COUNT(*) n
FROM customer_type
GROUP BY DATE_FORMAT(d,'%Y-%m'),`客户类型`
四、RFM模型
# 设现在时间的2011-12-09
CREATE VIEW RFM AS
SELECT customerid,DATEDIFF('2011-12-09',MAX(invoicedate)) R,
COUNT(DISTINCT invoiceno) F,
ROUND(SUM(quantity*unitprice),2) M
FROM uk2
GROUP BY customerid;
SELECT * FROM RFM LIMIT 5
根据均值划分高低两类,客户标签可能不准确。
SELECT *,CASE
WHEN R_score=1 AND F_score=1 AND M_score=1 THEN '重要价值客户'
WHEN R_score=1 AND F_score=1 AND M_score=0 THEN '重要保持客户'
WHEN R_score=0 AND F_score=1 AND M_score=1 THEN '重要挽留客户'
WHEN R_score=1 AND F_score=0 AND M_score=1 THEN '重要价值客户'
WHEN R_score=1 AND F_score=1 AND M_score=0 THEN '一般价值客户'
WHEN R_score=1 AND F_score=0 AND M_score=0 THEN '一般保持客户'
WHEN R_score=0 AND F_score=1 AND M_score=0 THEN '一般客户'
ELSE '流失客户' END `客户类型`
FROM
(
SELECT *,IF(R>(SELECT AVG(R) FROM RFM),0,1) R_score,
IF(F>(SELECT AVG(F) FROM RFM),1,0) F_score,
IF(M>(SELECT AVG(M) FROM RFM),1,0) M_score
FROM RFM
) temp;