设计hive数据仓库进行数据分析
目录
5.创建明细表ods_jobdata_detail 用户细化薪资字段的数据
7.对薪资进行扁平化处理,.将处理结果存储到临时中间表t_ods_tmp_salary
8.对临时中间表t_ods_tmp_salary中的每一条数据进行泛化处理,将处理结果存储到中间表t_ods_tmp_salary_dist中
9.对福利标签字段内容进行扁平化处理,将处理的结果储存到临时中间t_ods_tmp_company
10.对技能标签字段内容进行扁平化处理,将处理的结果储存到临时中间表t_ods_tmp_kill
11.创建维度表t_ods_company ,t_ods_company,t_ods_kill ,t_ods _city用于存储福利标签.薪资分布..技能标签.城市统计的结果
4.查看维度表t_ods_salary 中的分析结果...使用sort by参数对表虫的count列进行倒叙排序
一、设计hive数据仓库
1.创建数据仓库
[root@hadoop1 ~]# systemctl start mariadb
[root@hadoop1 ~]# hive
create database jobdata;
hive> use jobdata;
2.创建事实表
CREATE TABLE ods_jobdata_origin(
city string COMMENT '城市',
salary array<String>COMMENT '薪资',
company array<String>COMMENT '福利',
kill array<String>COMMENT '技能')
COMMENT '原始职位数据表'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
COLLECTION ITEMS TERMINATED BY '-'
STORED AS TEXTFILE;
3.加载数据到事实表
LOAD DATA INPATH '/JobData/output/part-r-00000' OVERWRITE INTO TABLE ods_jobdata_origin;
4.查询数据
select * from ods_jobdata_origin;
5.创建明细表ods_jobdata_detail 用户细化薪资字段的数据
create table ods_jobdata_detail(
city string comment '城市',
salary array<String>comment '薪资',
company array<String>comment '福利',
kill array<String>comment '技能',
low_salary int comment '低薪资',
high_salary int comment '高薪资',
avg_salary double comment '平均薪资')
COMMENT '职位数据明细表'
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;
6.加教数据到明细表
insert overwrite table ods_jobdata_detail
select city,salary,company,kill,salary[0],salary[1],
(salary[0]+salary[1])/2 from ods_jobdata_origin;
7.对薪资进行扁平化处理,.将处理结果存储到临时中间表t_ods_tmp_salary
create table t_ods_tmp_salary as select explode(ojo.salary) from ods_jobdata_origin ojo;
8.对临时中间表t_ods_tmp_salary中的每一条数据进行泛化处理,将处理结果存储到中间表t_ods_tmp_salary_dist中
create table t_ods_tmp_salary_dist as
select case when col>=0 and col<=5 then "0-5"
when col>=6 and col<=10 then "6-10"
when col>=11 and col<=15 then "11-15"
when col>=16 and col<=20 then "16-20"
when col>=21 and col<=25 then "21-25"
when col>=26 and col<=30 then "26-30"
when col>=31 and col<=35 then "31-35"
when col>=36 and col<=40 then "36-40"
when col>=41 and col<=45 then "41-45"
when col>=46 and col<=50 then "46-50"
when col>=51 and col<=55 then "51-55"
when col>=56 and col<=60 then "56-60"
when col>=61 and col<=65 then "61-65"
when col>=66 and col<=70 then "66-70"
when col>=71 and col<=75 then "71-75"
when col>=76 and col<=80 then "76-80"
when col>=81 and col<=85 then "81-85"
when col>=86 and col<=90 then "86-90"
when col>=91 and col<=95 then "91-95"
when col>=96 and col<=100 then "96-100"
when col >= 101 then ">101" end from t_ods_tmp_salary;
9.对福利标签字段内容进行扁平化处理,将处理的结果储存到临时中间t_ods_tmp_company
create table t_ods_tmp_company as select explode (ojo.company) from ods_jobdata_origin ojo;
10.对技能标签字段内容进行扁平化处理,将处理的结果储存到临时中间表t_ods_tmp_kill
create table t_ods_tmp_kill as select explode (ojo.kill) from ods_jobdata_origin ojo;
11.创建维度表t_ods_company ,t_ods_company,t_ods_kill ,t_ods _city用于存储福利标签.薪资分布..技能标签.城市统计的结果
create table t_ods_kill(
every_kill String comment '技能标签',
count int comment '词频')
COMMENT '技能标签词频统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
create table t_ods_company(
every_company String comment '福利标签',
count int comment '词频')
COMMENT '福利标签词频统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
create table t_ods_salary(
every_partition String comment '薪资分布',
count int comment '聚合统计')
COMMENT '薪资分布聚合统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
create table t_ods_city(
every_city String comment '城市',
count int comment '词频')
COMMENT '城市统计'
ROW FORMAT DELIMITED
fields terminated by ','
STORED AS TEXTFILE;
二、分析数据
1.职位区域分析
insert overwrite table t_ods_city
select city,count(1) from ods_jobdata_origin group by city;
2.倒叙查询职位区域的信息
select * from t_ods_city sort by count desc;
3.职位薪资分析
//查看表结构
hive> desc t_ods_tmp_salary_dist;
OK
c0 stringTime taken: 0.052 seconds, Fetched: 1 row(s)
insert overwrite table t_ods_salary
select `c0`,count(1) from t_ods_tmp_salary_dist group by `c0`;
4.查看维度表t_ods_salary 中的分析结果...使用sort by参数对表虫的count列进行倒叙排序
select * from t_ods_salary sort by count desc;
5.平均值
select avg(avg_salary) from ods_jobdata_detail;
6.众数
select avg_salary,count(1) as cnt from ods_jobdata_detail group by avg_salary order by cnt desc limit 1;
7.中位数
select percentile(cast(avg_salary as BIGINT),0.5) from ods_jobdata_detail;
8.各城市平均薪资待遇
select city,count(city),round(avg(avg_salary),2) as cnt from ods_jobdata_detail group by city order by cnt desc;
9.公司福利分析
insert overwrite table t_ods_company select col,count(1) from t_ods_tmp_company group by col;
10.查询维度表中的分析结果,倒叙,查询前10个
select every_company,count from t_ods_company sort by count desc limit 10;
11.职位技能要求分析
insert overwrite table t_ods_kill select col,count(1) from t_ods_tmp_kill group by col;
12.查看技能维度表中的分析结果,倒叙,查看前3个
select every_kill,count from t_ods_kill sort by count desc limit 3;