用MySQL对数据进行分析

分析目标:
根据某电商近期的销售记录,对每月的购买人数、复购率、回购率、消费时间、消费群体等进行分析。

数据来源
orderinfo表:

create table orderinfo(
orderid varchar(10) primary key,
userid varchar(10),
ispaid varchar(10),
prince varchar(20),
paidtime varchar(50));

userinfo表:

create table userinfo(
userid varchar(10) primary key,
sex char(2),
brith varchar(50));

导入数据:
本处用到MySQL自带的LOAD DATA INFILE的函数。对比了python的execute/executemany,LOAD DATA INFILE的速度真是非常快,谁用谁知道!
ps:如果用LOAD DATA INFILE,提示禁止加载本地数据;这必须在客户端和服务器端都启用。
那可能是加载本地文件没有开启,输入查看命令:show variables like 'local_infile';
然后开启:set global local_infile=on;
OK!

LOAD DATA LOCAL INFILE 'order_info.csv' into table  orderinfo  fields terminated by ',';
LOAD DATA LOCAL INFILE 'user_info.csv' into table  orderinfo  fields terminated by ',';

处理数据
由于orderinfo表中的paidtime与userinfo表中的brith是字符串类型,需要转换成日期类型,分别用了两种方法实现。

ALTER TABLE orderinfo CHANGE paidtime paidtime datetime;
UPDATE userinfo SET brith = CAST( brith AS date );

分析问题
统计各个月份的下单人数

SELECT DATE_FORMAT(paidtime,'%Y-%m') as '日期',
	count( DISTINCT userid ) as '用户数量'
FROM
	orderinfo 
WHERE
	ispaid = '已支付' 
GROUP BY
	MONTH ( paidtime );

在这里插入图片描述

统计用户三月份的复购率和回购率

#复购率是在本月消费中多少人消费一次以上的占比
SELECT
	COUNT( userid ) AS '购买总人数',
	SUM( a ) AS '复购总人数',
	COUNT( userid )/ SUM( a ) AS '活跃率' 
FROM
	( SELECT userid, IF ( count(*)> 1, 1, 0 ) AS a FROM orderinfo WHERE MONTH ( paidtime )= 3 GROUP BY userid ) AS t;

在这里插入图片描述

#回购率是三月份购买的人数在四月份依旧购买的占比
SELECT
	COUNT( DISTINCT t1.userid ) AS '三月购买总人数',
	COUNT( DISTINCT t2.userid ) AS '四月回购人数',
	COUNT( DISTINCT t2.userid )/ COUNT( DISTINCT t1.userid ) AS '回购率' 
FROM
	( SELECT DISTINCT userid FROM orderinfo WHERE MONTH ( paidtime )= 3 ) AS t1
	LEFT JOIN ( SELECT DISTINCT userid FROM orderinfo WHERE MONTH ( paidtime )= 4 ) AS t2 
	ON t1.userid = t2.userid;

在这里插入图片描述

统计男女的消费频次是否有差异

SELECT
	COUNT( userid )/ COUNT( DISTINCT userid ) AS '总消费频次',
	COUNT( M )/ COUNT( DISTINCT M ) AS '男性消费频次',
	COUNT( F )/ COUNT( DISTINCT F ) AS '女性消费频次' 
FROM
	(
	SELECT
		t1.orderid,
		t1.userid,
	IF
		( sex = '男', t1.userid, NULL ) AS M,
	IF
		( sex = '女', t1.userid, NULL ) AS F 
	FROM
		orderinfo AS t1
		INNER JOIN userinfo AS t2 ON t1.userid = t2.userid 
	WHERE
		t2.sex IS NOT NULL 
	) t3;

在这里插入图片描述

统计多次消费的用户,第一次和最后一次消费时间的间隔

SELECT
	userid,
	TIMESTAMPDIFF(
		HOUR,
		min( paidtime ),
	max( paidtime )) '时间间隔(小时)' 
FROM
	orderinfo 
WHERE
	paidtime IS NOT NULL 
GROUP BY
	userid 
HAVING
	COUNT( orderid )> 1 
ORDER BY
	时间间隔(小时);

在这里插入图片描述

统计不同年龄段的用户消费金额是否有差异

SELECT
	t2.age as '年龄段',
	AVG( t1.prince ) as '平均消费'
FROM
	orderinfo AS t1
	INNER JOIN (
	SELECT
		userid,
CASE
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 10 and 19 THEN
		'10-19岁' 
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 20 and 29 THEN
		'20-29岁' 
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 30 and 39 THEN
		'30-39岁' 
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 40 and 49 THEN
		'40-49岁' 
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 50 and 59 THEN
		'50-59岁' 
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 60 and 69 THEN
		'60-69岁' 
		WHEN TIMESTAMPDIFF( YEAR, brith, '2016-01-01' ) BETWEEN 70 and 79 THEN
		'70-79岁' 
		ELSE NULL 
	END 
	AS age FROM userinfo WHERE brith IS NOT NULL ) AS t2 
ON t1.userid = t2.userid
WHERE t1.ispaid = '已支付' 	AND t2.age IS NOT NULL
GROUP BY age ORDER BY	age;

在这里插入图片描述

统计消费的二八法则,消费的top20%用户,贡献了多少额度

SELECT
	sum( t1.user_prince ) as 'top20%用户总消费'
FROM
	(
	SELECT
		userid,
		SUM( prince ) AS user_prince,
		row_number() over ( ORDER BY sum( prince ) DESC ) AS num 
	FROM
		orderinfo 
	WHERE
		ispaid = '已支付' 
	GROUP BY
		userid 
	) AS t1 
WHERE
	num <(
	SELECT
		count( DISTINCT userid )* 0.2 
	FROM
		orderinfo 
	WHERE
		ispaid = '已支付' 
	);

在这里插入图片描述