• hive电子商务消费行为分析


    hive电子商务消费行为分析

    1. 掌握Zeppelin的使用

    2. 了解数据结构

    3.数据清洗

    4. 基于Hive的数据分析

    1.物料准备

    (1)Customer表

    customer_details

    details

    customer_id

    Int, 1 - 500

    first_name

    string

    last_name

    string

    email

    string, such as willddy@gmail.com

    gender

    string, Male or female

    address

    string

    country

    string

    language

    string

    job

    string, job title/position

    credit_type

    string, credit card type, such as visa

    credit_no

    string, credit card number

    (2)Store表

    store_details

    details

    store_id

    Int, 1 - 5

    store_name

    string

    employee_number

    Int, 在store有多少employee

    (3)Review表

    store_review

    details

    stransaction_id

    Int, 1 - 8000

    store_id

    Int, 1 - 5

    review_store

    Int, 1 - 5

    (4)Transaction表

    transaction_details

    details

    transaction_id

    Int, 1 - 1000

    customer_id

    Int, 1 - 500

    store_id

    Int, 1 - 5

    price

    decimal, such as 5.08

    product

    string, things bought

    date

    string, when to purchase

    time

    string, what time to purchase

    2. 实现数据清洗

    2.1  找出PII (personal information identification) 或PCI (personal confidential information) 数据进行加密或hash

    1. --数据加密处理
    2. select * from ext_customer_details;
    3. create view if not exists vm_customer_details as
    4. select
    5. customer_id,
    6. first_name,
    7. unbase64(last_name) as last_name,
    8. unbase64(email) as email,
    9. gender,
    10. unbase64(address) as address,
    11. country,
    12. language,
    13. job,
    14. credit_type,
    15. unbase64(concat(unbase64(credit_no),'kb23')) as credit_no
    16. from ext_customer_details;
    17. show views ;
    18. show tables ;
    19. select * from vw_customer_details;

    2.2  重新组织transaction数据按照日期YYYY-MM做分区,并对transaction_details中的重复数据生成新ID

    1. --交易详情表 按月创建分区表
    2. create table if not exists transaction_details (
    3. transaction_id string,
    4. customer_id string,
    5. store_id string,
    6. price decimal(8,2),
    7. product string,
    8. purchase_date string,
    9. purchase_time string
    10. )
    11. partitioned by (purchase_month string);
    12. ------ext_transaction_details 重复数据生成新的id-------------
    13. --查询重复的数据
    14. with
    15. base as ( select row_number() over (partition by transaction_id order by 1) as rn, * from ext_transaction_details )
    16. select * from base where rn > 1;
    17. --打开动态分区
    18. set hive.exec.dynamic.partition=true;
    19. set hive.exec.dynamic.partition.mode=nonstrict;
    20. --重复id处理方式 transaction_id_fix_rn --> 123456_fix_2
    21. with
    22. base as (
    23. select
    24. transaction_id,
    25. customer_id,
    26. store_id,
    27. price,
    28. product,
    29. purchase_date,
    30. purchase_time,
    31. from_unixtime(unix_timestamp(purchase_date, 'yyyy-MM-dd'),'yyyy-MM') as purchase_month,
    32. row_number() over (partition by transaction_id order by store_id) as rn
    33. from ext_transaction_details)
    34. insert overwrite table transaction_details partition (purchase_month)
    35. select `if`(rn=1,transaction_id, concat(transaction_id, '_fix_', rn)), customer_id, store_id,
    36. price, product, purchase_date, purchase_time,purchase_month from base;
    37. show partitions transaction_details;

    2.3  过滤掉store_review中没有评分的数据

    %hive

    use shopping;

    -- select * from ext_store_review where review_score == '';

    create view vw_store_review as(

    select transaction_id, store_id, review_score from ext_store_review where review_score <> '');

    3. Customer分析(zeppelin和datagrip查询)

    3.1 找出顾客最常用的信用卡

    %hive

    use shopping;

    select

        country,

        credit_type,

        count(distinct credit_no) as credit_cnt

    from vm_customer_details

    group by country,credit_type

    order by credit_cnt desc;

    3.2 找出客户资料中排名前五的职位名称

    %hive

    use shopping;

    select job, count(1) as pn

    from vw_customer_details

    group by job

    order by pn desc limit 10;

    3.3 在美国女性最常用的信用卡

    %hive

    use shopping;

    select

        credit_type,

        count(1) as credit_cnt

    from vw_customer_details

    where country='United States' and gender = 'Female'

    group by credit_type

    order by credit_cnt desc limit 3;

    3.4 按性别和国家进行客户统计

    %hive

    use shopping;

    select

        country,

        gender,

        count(1) as cn

    from vw_customer_details

    group by country,gender

    order by cn desc limit 3;

    4. Transaction分析

    4.1计算每月总收入

    4.2计算每个季度的总收入

    4.3按年计算总收入

    4.4按工作日计算总收入

    4.5按时间段计算总收入(需要清理数据)

    4.6按时间段计算平均消费

    4.4按工作日计算平均消费

    4.8计算年、月、日的交易总数

    4.9找出交易量最大的10个客户

    4.10找出消费最多的前10位顾客

    1. -- 1计算每月总收入
    2. select
    3. purchase_month, sum(price) as monthSUM
    4. from transaction_details
    5. group by purchase_month;
     
    1. -- 2计算每季度总收入
    2. with
    3. base as ( select
    4. price,
    5. concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as string))as year_quarter
    6. from transaction_details)
    7. select year_quarter, sum(price) as year_quarterSUM from base group by year_quarter;
    8. -- 3计算每年度总收入
    9. select
    10. substr(purchase_date,1,4),
    11. sum(price) as yearSUM
    12. from transaction_details
    13. group by substr(purchase_date,1,4);
    14. -- 4计算工作日收入
    15. select
    16. `dayofweek`(purchase_date),
    17. sum(price) as daySUM
    18. from transaction_details
    19. group by `dayofweek`(purchase_date)
    20. having `dayofweek`(purchase_date) in (2,3,4,5,6);
    21. -- 5按时间段计算总收入(需要清理数据)
    22. with
    23. base as (
    24. select price,purchase_time,
    25. if(purchase_time like '%M',from_unixtime(unix_timestamp(purchase_time,'hh:mm aa'),'HH:mm'),purchase_time) as time_format
    26. from transaction_details),
    27. timeformat as (
    28. select price, purchase_time, time_format, (cast(split(time_format,':')[0] as decimal(4,2))+ cast(split(time_format,':')[1] as decimal(4,2))/60) as purchase_time_in_hr from base),
    29. t1 as (select price, purchase_time, time_format, purchase_time_in_hr,
    30. `if`(purchase_time_in_hr > 5 and purchase_time_in_hr <= 8, 'early morning',
    31. `if`(purchase_time_in_hr > 8 and purchase_time_in_hr <= 11, 'morning',
    32. `if`(purchase_time_in_hr > 11 and purchase_time_in_hr <= 13, 'noon',
    33. `if`(purchase_time_in_hr > 13 and purchase_time_in_hr <= 18, 'afternoon',
    34. `if`(purchase_time_in_hr > 18 and purchase_time_in_hr <= 22, 'evening', 'night'))))) as time_bucket
    35. from timeformat)
    36. select time_bucket, sum(price) bucketSUM, avg(price) avgprice from t1 group by time_bucket ;
     
    1. -- 6按时间段计算平均消费
    2. -- 7按工作日计算平均消费
    3. select
    4. `dayofweek`(purchase_date),
    5. avg(price) as dayAVG
    6. from transaction_details
    7. where `dayofweek`(purchase_date) in (2,3,4,5,6)
    8. group by `dayofweek`(purchase_date);
    9. -- 8计算年、月、日的交易总数
    10. select
    11. year(purchase_date) currentYear,
    12. count(1) as sumCount
    13. from transaction_details
    14. group by year(purchase_date);
    15. select
    16. substring(purchase_date,1,7) currentMonth,
    17. count(1) as sumCount
    18. from transaction_details
    19. group by substring(purchase_date,1,7);
    20. select
    21. day(purchase_date) currentDay,
    22. count(1) as sumCount
    23. from transaction_details
    24. group by day(purchase_date);
    25. -- 9找出交易量最大的10个客户
    26. with
    27. base as (
    28. select customer_id,count(transaction_id) as trans_cnt,sum(price) as customerSUM from transaction_details group by customer_id),
    29. cust_detail as(
    30. select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,
    31. dense_rank() over (order by trans_cnt desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)
    32. select * from cust_detail where rn<=10;
    33. -- 10找出消费最多的前10位顾客
    34. with
    35. base as (
    36. select customer_id,sum(price) as customerSUM from transaction_details group by customer_id),
    37. cust_detail as(
    38. select concat_ws(' ', cd.first_name, '***') as cust_name, base.* ,
    39. dense_rank() over (order by customerSUM desc ) rn from base join vw_customer_details cd on base.customer_id=cd.customer_id)
    40. select * from cust_detail where rn<=10;
     

    Mysql中建表查询

    7.11统计该期间交易数量最少的用户

    7.12计算每个季度的独立客户总数

    7.13计算每周的独立客户总数

    7.14计算整个活动客户平均花费的最大值

    7.15统计每月花费最多的客户

    7.16统计每月访问次数最多的客户

    7.17按总价找出最受欢迎的5种产品

    7.18根据购买频率找出最畅销的5种产品

    7.19根据客户数量找出最受欢迎的5种产品

    1. -- 7.11统计该期间交易数量最少的用户
    2. WITH
    3. t1 AS(SELECT customer_id,COUNT(1) c FROM transaction_details GROUP BY customer_id ORDER BY c),
    4. t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY t1.c) as rn FROM t1)
    5. SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;
    6. -- 7.12计算每个季度的独立客户总数
    7. SELECT DISTINCT COUNT(customer_id),
    8. concat_ws('-',substr(purchase_date,1,4),cast(ceil(month(purchase_date)/3.0) as CHAR(20)))as year_quarter
    9. from transaction_details
    10. GROUP BY year_quarter
    11. ORDER BY (year_quarter+0);
    12. -- 7.13计算每周的独立客户总数
    13. SELECT DISTINCT COUNT(customer_id),
    14. concat_ws('-',substr(purchase_date,1,4),cast(WEEKOFYEAR(purchase_date) as CHAR(20)))as week_year
    15. from transaction_details
    16. GROUP BY week_year;
    17. -- 7.14计算整个活动客户平均花费的最大值
    18. WITH
    19. t1 AS(SELECT customer_id, avg(price) as costAVG FROM transaction_details GROUP BY customer_id),
    20. t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY costAVG DESC) as rn FROM t1)
    21. SELECT * FROM t2 WHERE rn=1;
    22. -- 7.15统计每月花费最多的客户
    23. WITH
    24. t1 AS(SELECT purchase_month,customer_id, sum(price) as costSUM FROM transaction_details GROUP BY purchase_month,customer_id),
    25. t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costSUM DESC) as rn FROM t1)
    26. SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;
    27. -- 7.16统计每月访问次数最多的客户
    28. WITH
    29. t1 AS(SELECT purchase_month,customer_id, count(customer_id) as costCount FROM transaction_details GROUP BY purchase_month,customer_id),
    30. t2 AS(SELECT t1.*,DENSE_RANK() over(PARTITION by purchase_month ORDER BY costCount DESC) as rn FROM t1)
    31. SELECT CONCAT(vcd.first_name,'***'), t2.* FROM t2 JOIN vw_customer_details vcd ON t2.customer_id=vcd.customer_id WHERE rn=1;
    32. -- 7.17按总价找出最受欢迎的5种产品
    33. WITH
    34. t1 AS(SELECT product,sum(price) as proSUM FROM transaction_details GROUP BY product),
    35. t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proSUM DESC) as rn FROM t1)
    36. SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;
    37. -- 7.18根据购买频率找出最畅销的5种产品
    38. WITH
    39. t1 AS(SELECT product,COUNT(product) as proCount FROM transaction_details GROUP BY product),
    40. t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)
    41. SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;
    42. -- 7.19根据客户数量找出最受欢迎的5种产品
    43. WITH
    44. t1 AS(SELECT product,COUNT(customer_id) as proCount FROM transaction_details GROUP BY product),
    45. t2 AS(SELECT t1.*,DENSE_RANK() over(ORDER BY proCount DESC) as rn FROM t1)
    46. SELECT * FROM t2 WHERE rn BETWEEN 1 AND 5;

  • 相关阅读:
    ETL与ELT理解
    4-7再谈方法之方法参数的值传递(练习)
    手把手教你配置Linux深度学习服务器-不用本地安装!
    TCP/IP协议详解
    腾讯:《智能科技 跨界相变——2024数字科技前沿应用趋势》
    STL——vector
    小程序的破局之道,数字化营销已然成为趋势
    SpringBoot集成easypoi实现execl导出
    Linux介绍
    ORB-SLAM2 ---- Initializer::ReconstructF函数
  • 原文地址:https://blog.csdn.net/weixin_63713552/article/details/132887997