hive电子商务消费行为分析

1. 掌握Zeppelin的使用

2. 了解数据结构

3．数据清洗

4. 基于Hive的数据分析

1.物料准备

（1）Customer表

customer_details	details
customer_id	Int, 1 - 500
first_name	string
last_name	string
email	string, such as willddy@gmail.com
gender	string, Male or female
address	string
country	string
language	string
job	string, job title/position
credit_type	string, credit card type, such as visa
credit_no	string, credit card number

（2）Store表

store_details	details
store_id	Int, 1 - 5
store_name	string
employee_number	Int, 在store有多少employee

（3）Review表

store_review	details
stransaction_id	Int, 1 - 8000
store_id	Int, 1 - 5
review_store	Int, 1 - 5

（4）Transaction表

transaction_details	details
transaction_id	Int, 1 - 1000
customer_id	Int, 1 - 500
store_id	Int, 1 - 5
price	decimal, such as 5.08
product	string, things bought
date	string, when to purchase
time	string, what time to purchase

2. 实现数据清洗

2.1 找出PII (personal information identification) 或PCI (personal confidential information) 数据进行加密或hash


--数据加密处理
select * from ext_customer_details;
create view if not exists vm_customer_details as
select
    customer_id,
    first_name,
    unbase64(last_name) as last_name,
    unbase64(email) as email,
    gender,
    unbase64(address) as address,
    country,
    language,
    job,
    credit_type,
    unbase64(concat(unbase64(credit_no),'kb23')) as credit_no
from ext_customer_details;
 
show views ;
show tables ;
select * from vw_customer_details;

2.2 重新组织transaction数据按照日期YYYY-MM做分区，并对transaction_details中的重复数据生成新ID


--交易详情表  按月创建分区表
 
  create table if not exists transaction_details (
 
    transaction_id string,
 
    customer_id string,
 
    store_id string,
 
    price decimal(8,2),
 
    product string,
 
    purchase_date string,
 
    purchase_time string
 
  )
 
  partitioned by (purchase_month string);
 
 
------ext_transaction_details  重复数据生成新的id-------------
 
--查询重复的数据
 
  with
 
    base as ( select row_number() over (partition by transaction_id order by 1) as rn, * from ext_transaction_details )
 
  select * from base where rn > 1;
 
  
 
  --打开动态分区
 
  set hive.exec.dynamic.partition=true;
 
  set hive.exec.dynamic.partition.mode=nonstrict;
 
  
 
  --重复id处理方式  transaction_id_fix_rn  --> 123456_fix_2
 
  with
 
    base as (
 
        select
 
            transaction_id,
 
            customer_id,
 
            store_id,
 
            price,
 
            product,
 
            purchase_date,
 
            purchase_time,
 
            from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'),'yyyy-MM') as purchase_month,
 
            row_number() over (partition by transaction_id order by store_id) as rn
 
        from ext_transaction_details)
 
  insert overwrite table transaction_details partition (purchase_month)
 
  select `if`(rn=1,transaction_id, concat(transaction_id, '_fix_', rn)), customer_id, store_id,
 
            price, product, purchase_date, purchase_time,purchase_month from base;
 
  
 
  show partitions transaction_details;

2.3 过滤掉store_review中没有评分的数据

%hive

use shopping;

-- select * from ext_store_review where review_score == '';

create view vw_store_review as(

select transaction_id, store_id, review_score from ext_store_review where review_score <> '');

3. Customer分析（zeppelin和datagrip查询）

3.1 找出顾客最常用的信用卡

%hive

use shopping;

select

country,

credit_type,

count(distinct credit_no) as credit_cnt

from vm_customer_details

group by country,credit_type

order by credit_cnt desc;

3.2 找出客户资料中排名前五的职位名称

%hive

use shopping;

select job, count(1) as pn

from vw_customer_details

group by job

order by pn desc limit 10;

3.3 在美国女性最常用的信用卡

%hive

use shopping;

select

credit_type,

count(1) as credit_cnt

from vw_customer_details

where country='United States' and gender = 'Female'

group by credit_type

order by credit_cnt desc limit 3;

3.4 按性别和国家进行客户统计

%hive

use shopping;

select

country,

gender,

count(1) as cn

from vw_customer_details

group by country,gender

order by cn desc limit 3;

4. Transaction分析

4.1计算每月总收入

4.2计算每个季度的总收入

4.3按年计算总收入

4.4按工作日计算总收入

4.5按时间段计算总收入(需要清理数据)

4.6按时间段计算平均消费

4.4按工作日计算平均消费

4.8计算年、月、日的交易总数

4.9找出交易量最大的10个客户

4.10找出消费最多的前10位顾客


-- 1计算每月总收入
 
  select
 
    purchase_month, sum(price) as monthSUM
 
  from transaction_details
 
  group by purchase_month;


 
  -- 2计算每季度总收入
 
  with
 
    base as ( select
 
    price,
 
    concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as string))as year_quarter
 
  from transaction_details)
 
  select  year_quarter, sum(price) as year_quarterSUM from base group by year_quarter;
 
  
 
  -- 3计算每年度总收入
 
  select
 
    substr(purchase_date,1,4),
 
    sum(price) as yearSUM
 
  from transaction_details
 
  group by substr(purchase_date,1,4);
 
  
 
  
 
 
 
  -- 4计算工作日收入
 
  select
 
    `dayofweek`(purchase_date),
 
    sum(price) as daySUM
 
  from transaction_details
 
  group by `dayofweek`(purchase_date)
 
  having `dayofweek`(purchase_date) in (2,3,4,5,6);
 
  
 
  -- 5按时间段计算总收入(需要清理数据)
 
  with
 
    base as (
 
        select price,purchase_time,
 
        if(purchase_time like '%M',from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),purchase_time) as time_format
 
        from transaction_details),
 
    timeformat as (
 
  select price, purchase_time, time_format, (cast(split(time_format,':')[0] as decimal(4,2))+ cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hr from base),
 
t1 as (select price, purchase_time, time_format, purchase_time_in_hr,
 
    `if`(purchase_time_in_hr > 5 and purchase_time_in_hr <= 8, 'early morning',
 
    `if`(purchase_time_in_hr > 8 and purchase_time_in_hr <= 11, 'morning',
 
    `if`(purchase_time_in_hr > 11 and purchase_time_in_hr <= 13, 'noon',
 
    `if`(purchase_time_in_hr > 13 and purchase_time_in_hr <= 18, 'afternoon',
 
    `if`(purchase_time_in_hr > 18 and purchase_time_in_hr <= 22, 'evening', 'night'))))) as time_bucket
 
  from timeformat)
 
  select time_bucket, sum(price) bucketSUM, avg(price) avgprice from t1 group by time_bucket ;


   
 
  
 
  -- 6按时间段计算平均消费
 
  
 
  
 
  -- 7按工作日计算平均消费
 
  select
 
    `dayofweek`(purchase_date),
 
    avg(price) as dayAVG
 
  from transaction_details
 
  where `dayofweek`(purchase_date) in (2,3,4,5,6)
 
  group by `dayofweek`(purchase_date);
 
  
 
  -- 8计算年、月、日的交易总数
 
  select
 
    year(purchase_date) currentYear,
 
    count(1) as sumCount
 
  from transaction_details
 
  group by year(purchase_date);
 
  
 
  select
 
    substring(purchase_date,1,7) currentMonth,
 
    count(1) as sumCount
 
  from transaction_details
 
  group by substring(purchase_date,1,7);
 
  
 
  select
 
    day(purchase_date) currentDay,
 
    count(1) as sumCount
 
  from transaction_details
 
  group by day(purchase_date);
 
  
 
  -- 9找出交易量最大的10个客户
 
  with
 
    base as (
 
    select customer_id,count(transaction_id) as trans_cnt,sum(price) as customerSUM from transaction_details group by customer_id),
 
    cust_detail as(
 
    select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,
 
       dense_rank() over (order by trans_cnt desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)
 
  select * from cust_detail where rn<=10;
 
  
 
  
 
  -- 10找出消费最多的前10位顾客
 
  with
 
    base as (
 
    select customer_id,sum(price) as customerSUM from transaction_details group by customer_id),
 
    cust_detail as(
 
    select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,
 
       dense_rank() over (order by customerSUM desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)
 
  select * from cust_detail where rn<=10;

Mysql中建表查询

7.11统计该期间交易数量最少的用户

7.12计算每个季度的独立客户总数

7.13计算每周的独立客户总数

7.14计算整个活动客户平均花费的最大值

7.15统计每月花费最多的客户

7.16统计每月访问次数最多的客户

7.17按总价找出最受欢迎的5种产品

7.18根据购买频率找出最畅销的5种产品

7.19根据客户数量找出最受欢迎的5种产品


-- 7.11统计该期间交易数量最少的用户
 
WITH
 
       t1 AS(SELECT customer_id,COUNT(1) c FROM transaction_details GROUP BY customer_id ORDER BY c),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY t1.c) as rn FROM t1)
 
SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;
 
 
 
-- 7.12计算每个季度的独立客户总数
 
SELECT DISTINCT COUNT(customer_id),
 
       concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as CHAR(20)))as year_quarter
 
from transaction_details
 
GROUP BY year_quarter
 
ORDER BY (year_quarter+0);
 
 
 
-- 7.13计算每周的独立客户总数
 
SELECT DISTINCT COUNT(customer_id),
 
       concat_ws('-',substr(purchase_date,1,4),cast(WEEKOFYEAR(purchase_date) as CHAR(20)))as week_year
 
from transaction_details
 
GROUP BY week_year;
 
 
 
-- 7.14计算整个活动客户平均花费的最大值
 
WITH
 
       t1 AS(SELECT customer_id, avg(price) as costAVG FROM transaction_details GROUP BY customer_id),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY costAVG DESC) as rn FROM t1)
 
SELECT * FROM t2 WHERE rn=1;
 
 
 
-- 7.15统计每月花费最多的客户
 
WITH
 
       t1 AS(SELECT purchase_month,customer_id, sum(price) as costSUM FROM transaction_details GROUP BY purchase_month,customer_id),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costSUM DESC) as rn FROM t1)
 
SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;
 
 
-- 7.16统计每月访问次数最多的客户
 
WITH
 
       t1 AS(SELECT purchase_month,customer_id, count(customer_id) as costCount FROM transaction_details GROUP BY purchase_month,customer_id),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costCount DESC) as rn FROM t1)
 
SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;
 
 
 
-- 7.17按总价找出最受欢迎的5种产品
 
WITH
 
       t1 AS(SELECT product,sum(price) as proSUM FROM transaction_details GROUP BY product),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proSUM DESC) as rn FROM t1)
 
SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;
 
 
 
-- 7.18根据购买频率找出最畅销的5种产品
 
WITH
 
       t1 AS(SELECT product,COUNT(product) as proCount FROM transaction_details GROUP BY product),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)
 
SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;
 
 
 
-- 7.19根据客户数量找出最受欢迎的5种产品
 
WITH
 
       t1 AS(SELECT product,COUNT(customer_id) as proCount FROM transaction_details GROUP BY product),
 
       t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)
 
SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;

相关阅读:
ETL与ELT理解
 4-7再谈方法之方法参数的值传递（练习）
手把手教你配置Linux深度学习服务器-不用本地安装！
TCP/IP协议详解
 腾讯：《智能科技跨界相变——2024数字科技前沿应用趋势》
STL——vector
小程序的破局之道，数字化营销已然成为趋势
 SpringBoot集成easypoi实现execl导出
 Linux介绍
 ORB-SLAM2 ---- Initializer::ReconstructF函数
原文地址：https://blog.csdn.net/weixin_63713552/article/details/132887997