第一章 環境準備
1.1 用戶信息表
1)表結構
user_id(用戶id) | gender(性別) | birthday(生日) |
---|---|---|
101 | 男 | 1990-01-01 |
102 | 女 | 1991-02-01 |
103 | 女 | 1992-03-01 |
104 | 男 | 1993-04-01 |
2)建表語句
hive>
DROP TABLE IF EXISTS user_info;
create table user_info(
`user_id` string COMMENT '用戶id',
`gender` string COMMENT '性別',
`birthday` string COMMENT '生日'
) COMMENT '用戶信息表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table user_info
values ('101', '男', '1990-01-01'),
('102', '女', '1991-02-01'),
('103', '女', '1992-03-01'),
('104', '男', '1993-04-01'),
('105', '女', '1994-05-01'),
('106', '男', '1995-06-01'),
('107', '女', '1996-07-01'),
('108', '男', '1997-08-01'),
('109', '女', '1998-09-01'),
('1010', '男', '1999-10-01');
1.2 商品信息表
1)表結構
sku_id(商品id) | name(商品名稱) | category_id(分類id) | from_date(上架日期) | price(商品價格) |
---|---|---|---|---|
1 | xiaomi 10 | 1 | 2020-01-01 | 2000 |
6 | 洗碗機 | 2 | 2020-02-01 | 2000 |
9 | 自行車 | 3 | 2020-01-01 | 1000 |
2)建表語句
hive>
DROP TABLE IF EXISTS sku_info;
CREATE TABLE sku_info(
`sku_id` string COMMENT '商品id',
`name` string COMMENT '商品名稱',
`category_id` string COMMENT '所屬分類id',
`from_date` string COMMENT '上架日期',
`price` double COMMENT '商品單價'
) COMMENT '商品屬性表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table sku_info
values ('1', 'xiaomi 10', '1', '2020-01-01', 2000),
('2', '手機殼', '1', '2020-02-01', 10),
('3', 'apple 12', '1', '2020-03-01', 5000),
('4', 'xiaomi 13', '1', '2020-04-01', 6000),
('5', '破壁機', '2', '2020-01-01', 500),
('6', '洗碗機', '2', '2020-02-01', 2000),
('7', '熱水壺', '2', '2020-03-01', 100),
('8', '微波爐', '2', '2020-04-01', 600),
('9', '自行車', '3', '2020-01-01', 1000),
('10', '帳篷', '3', '2020-02-01', 100),
('11', '燒烤架', '3', '2020-02-01', 50),
('12', '遮陽傘', '3', '2020-03-01', 20);
1.3 商品分類信息表
1)表結構
category_id(分類id) | category_name(分類名稱) |
---|---|
1 | 數碼 |
2 | 廚衛 |
3 | 戶外 |
2)建表語句
hive>
DROP TABLE IF EXISTS category_info;
create table category_info(
`category_id` string,
`category_name` string
) COMMENT '品類表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table category_info
values ('1','數碼'),
('2','廚衛'),
('3','戶外');
1.4 訂單信息表
1)表結構
order_id(訂單id) | user_id(用戶id) | create_date(下單日期) | total_amount(訂單金額) |
---|---|---|---|
1 | 101 | 2021-09-30 | 29000.00 |
10 | 103 | 2020-10-02 | 28000.00 |
2)建表語句
hive>
DROP TABLE IF EXISTS order_info;
create table order_info(
`order_id` string COMMENT '訂單id',
`user_id` string COMMENT '用戶id',
`create_date` string COMMENT '下單日期',
`total_amount` decimal(16, 2) COMMENT '訂單總金額'
) COMMENT '訂單表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table order_info
values ('1', '101', '2021-09-27', 29000.00),
('2', '101', '2021-09-28', 70500.00),
('3', '101', '2021-09-29', 43300.00),
('4', '101', '2021-09-30', 860.00),
('5', '102', '2021-10-01', 46180.00),
('6', '102', '2021-10-01', 50000.00),
('7', '102', '2021-10-01', 75500.00),
('8', '102', '2021-10-02', 6170.00),
('9', '103', '2021-10-02', 18580.00),
('10', '103', '2021-10-02', 28000.00),
('11', '103', '2021-10-02', 23400.00),
('12', '103', '2021-10-03', 5910.00),
('13', '104', '2021-10-03', 13000.00),
('14', '104', '2021-10-03', 69500.00),
('15', '104', '2021-10-03', 2000.00),
('16', '104', '2021-10-03', 5380.00),
('17', '105', '2021-10-04', 6210.00),
('18', '105', '2021-10-04', 68000.00),
('19', '105', '2021-10-04', 43100.00),
('20', '105', '2021-10-04', 2790.00),
('21', '106', '2021-10-04', 9390.00),
('22', '106', '2021-10-05', 58000.00),
('23', '106', '2021-10-05', 46600.00),
('24', '106', '2021-10-05', 5160.00),
('25', '107', '2021-10-05', 55350.00),
('26', '107', '2021-10-05', 14500.00),
('27', '107', '2021-10-06', 47400.00),
('28', '107', '2021-10-06', 6900.00),
('29', '108', '2021-10-06', 56570.00),
('30', '108', '2021-10-06', 44500.00),
('31', '108', '2021-10-07', 50800.00),
('32', '108', '2021-10-07', 3900.00),
('33', '109', '2021-10-07', 41480.00),
('34', '109', '2021-10-07', 88000.00),
('35', '109', '2020-10-08', 15000.00),
('36', '109', '2020-10-08', 9020.00),
('37', '1010', '2020-10-08', 9260.00),
('38', '1010', '2020-10-08', 12000.00),
('39', '1010', '2020-10-08', 23900.00),
('40', '1010', '2020-10-08', 6790.00);
1.5 訂單明細表
1)表結
order_detail_id(訂單明細id) | order_id(訂單id) | sku_id(商品id) | create_date(下單日期) | price(商品單價) | sku_num(商品件數) |
---|---|---|---|---|---|
1 | 1 | 1 | 2021-09-30 | 2000.00 | 2 |
2 | 1 | 3 | 2021-09-30 | 5000.00 | 5 |
22 | 10 | 4 | 2020-10-02 | 6000.00 | 1 |
23 | 10 | 5 | 2020-10-02 | 500.00 | 24 |
24 | 10 | 6 | 2020-10-02 | 2000.00 | 5 |
2)建表語句
hive>
DROP TABLE IF EXISTS order_detail;
CREATE TABLE order_detail
(
`order_detail_id` string COMMENT '訂單明細id',
`order_id` string COMMENT '訂單id',
`sku_id` string COMMENT '商品id',
`create_date` string COMMENT '下單日期',
`price` decimal(16, 2) COMMENT '下單時的商品單價',
`sku_num` int COMMENT '下單商品件數'
) COMMENT '訂單明細表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
INSERT overwrite table order_detail
values ('1', '1', '1', '2021-09-27', 2000.00, 2),
('2', '1', '3', '2021-09-27', 5000.00, 5),
('3', '2', '4', '2021-09-28', 6000.00, 9),
('4', '2', '5', '2021-09-28', 500.00, 33),
('5', '3', '7', '2021-09-29', 100.00, 37),
('6', '3', '8', '2021-09-29', 600.00, 46),
('7', '3', '9', '2021-09-29', 1000.00, 12),
('8', '4', '12', '2021-09-30', 20.00, 43),
('9', '5', '1', '2021-10-01', 2000.00, 8),
('10', '5', '2', '2021-10-01', 10.00, 18),
('11', '5', '3', '2021-10-01', 5000.00, 6),
('12', '6', '4', '2021-10-01', 6000.00, 8),
('13', '6', '6', '2021-10-01', 2000.00, 1),
('14', '7', '7', '2021-10-01', 100.00, 17),
('15', '7', '8', '2021-10-01', 600.00, 48),
('16', '7', '9', '2021-10-01', 1000.00, 45),
('17', '8', '10', '2021-10-02', 100.00, 48),
('18', '8', '11', '2021-10-02', 50.00, 15),
('19', '8', '12', '2021-10-02', 20.00, 31),
('20', '9', '1', '2021-09-30', 2000.00, 9),
('21', '9', '2', '2021-10-02', 10.00, 5800),
('22', '10', '4', '2021-10-02', 6000.00, 1),
('23', '10', '5', '2021-10-02', 500.00, 24),
('24', '10', '6', '2021-10-02', 2000.00, 5),
('25', '11', '8', '2021-10-02', 600.00, 39),
('26', '12', '10', '2021-10-03', 100.00, 47),
('27', '12', '11', '2021-10-03', 50.00, 19),
('28', '12', '12', '2021-10-03', 20.00, 13000),
('29', '13', '1', '2021-10-03', 2000.00, 4),
('30', '13', '3', '2021-10-03', 5000.00, 1),
('31', '14', '4', '2021-10-03', 6000.00, 5)
('32', '14', '5', '2021-10-03', 500.00, 47),
('33', '14', '6', '2021-10-03', 2000.00, 8),
('34', '15', '7', '2021-10-03', 100.00, 20),
('35', '16', '10', '2021-10-03', 100.00, 22),
('36', '16', '11', '2021-10-03', 50.00, 42),
('37', '16', '12', '2021-10-03', 20.00, 7400),
('38', '17', '1', '2021-10-04', 2000.00, 3),
('39', '17', '2', '2021-10-04', 10.00, 21),
('40', '18', '4', '2021-10-04', 6000.00, 8),
('41', '18', '5', '2021-10-04', 500.00, 28),
('42', '18', '6', '2021-10-04', 2000.00, 3),
('43', '19', '7', '2021-10-04', 100.00, 55),
('44', '19', '8', '2021-10-04', 600.00, 11),
('45', '19', '9', '2021-10-04', 1000.00, 31),
('46', '20', '11', '2021-10-04', 50.00, 45),
('47', '20', '12', '2021-10-04', 20.00, 27)
('48', '21', '1', '2021-10-04', 2000.00, 2),
('49', '21', '2', '2021-10-04', 10.00, 39),
('50', '21', '3', '2021-10-04', 5000.00, 1),
('51', '22', '4', '2021-10-05', 6000.00, 8),
('52', '22', '5', '2021-10-05', 500.00, 20),
('53', '23', '7', '2021-10-05', 100.00, 58),
('54', '23', '8', '2021-10-05', 600.00, 18),
('55', '23', '9', '2021-10-05', 1000.00, 30),
('56', '24', '10', '2021-10-05', 100.00, 27),
('57', '24', '11', '2021-10-05', 50.00, 28),
('58', '24', '12', '2021-10-05', 20.00, 53),
('59', '25', '1', '2021-10-05', 2000.00, 5),
('60', '25', '2', '2021-10-05', 10.00, 35),
('61', '25', '3', '2021-10-05', 5000.00, 9),
('62', '26', '4', '2021-10-05', 6000.00, 1),
('63', '26', '5', '2021-10-05', 500.00, 13),
('64', '26', '6', '2021-10-05', 2000.00, 1),
('65', '27', '7', '2021-10-06', 100.00, 30),
('66', '27', '8', '2021-10-06', 600.00, 19),
('67', '27', '9', '2021-10-06', 1000.00, 33),
('68', '28', '10', '2021-10-06', 100.00, 37),
('69', '28', '11', '2021-10-06', 50.00, 46),
('70', '28', '12', '2021-10-06', 20.00, 45),
('71', '29', '1', '2021-10-06', 2000.00, 8),
('72', '29', '2', '2021-10-06', 10.00, 57),
('73', '29', '3', '2021-10-06', 5000.00, 8),
('74', '30', '4', '2021-10-06', 6000.00, 3),
('75', '30', '5', '2021-10-06', 500.00, 33),
('76', '30', '6', '2021-10-06', 2000.00, 5),
('77', '31', '8', '2021-10-07', 600.00, 13),
('78', '31', '9', '2021-10-07', 1000.00, 43),
('79', '32', '10', '2021-10-07', 100.00, 24),
('80', '32', '11', '2021-10-07', 50.00, 30),
('81', '33', '1', '2021-10-07', 2000.00, 8),
('82', '33', '2', '2021-10-07', 10.00, 48),
('83', '33', '3', '2021-10-07', 5000.00, 5),
('84', '34', '4', '2021-10-07', 6000.00, 10),
('85', '34', '5', '2021-10-07', 500.00, 44),
('86', '34', '6', '2021-10-07', 2000.00, 3),
('87', '35', '8', '2020-10-08', 600.00, 25),
('88', '36', '10', '2020-10-08', 100.00, 57),
('89', '36', '11', '2020-10-08', 50.00, 44),
('90', '36', '12', '2020-10-08', 20.00, 56),
('91', '37', '1', '2020-10-08', 2000.00, 2),
('92', '37', '2', '2020-10-08', 10.00, 26),
('93', '37', '3', '2020-10-08', 5000.00, 1),
('94', '38', '6', '2020-10-08', 2000.00, 6),
('95', '39', '7', '2020-10-08', 100.00, 35),
('96', '39', '8', '2020-10-08', 600.00, 34),
('97', '40', '10', '2020-10-08', 100.00, 37),
('98', '40', '11', '2020-10-08', 50.00, 51),
('99', '40', '12', '2020-10-08', 20.00, 27);
1.6 登錄明細表
1)表結構
user_id(用戶id) | ip_address(ip地址) | login_ts(登錄時間) | logout_ts(登出時間) |
---|---|---|---|
101 | 180.149.130.161 | 2021-09-21 08:00:00 | 2021-09-27 08:30:00 |
102 | 120.245.11.2 | 2021-09-22 09:00:00 | 2021-09-27 09:30:00 |
103 | 27.184.97.3 | 2021-09-23 10:00:00 | 2021-09-27 10:30:00 |
2)建表語句
hive>
DROP TABLE IF EXISTS user_login_detail;
CREATE TABLE user_login_detail
(
`user_id` string comment '用戶id',
`ip_address` string comment 'ip地址',
`login_ts` string comment '登錄時間',
`logout_ts` string comment '登出時間'
) COMMENT '用戶登錄明細表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
INSERT overwrite table user_login_detail
VALUES ('101', '180.149.130.161', '2021-09-21 08:00:00', '2021-09-27 08:30:00'),
('101', '180.149.130.161', '2021-09-27 08:00:00', '2021-09-27 08:30:00'),
('101', '180.149.130.161', '2021-09-28 09:00:00', '2021-09-28 09:10:00'),
('101', '180.149.130.161', '2021-09-29 13:30:00', '2021-09-29 13:50:00'),
('101', '180.149.130.161', '2021-09-30 20:00:00', '2021-09-30 20:10:00'),
('102', '120.245.11.2', '2021-09-22 09:00:00', '2021-09-27 09:30:00'),
('102', '120.245.11.2', '2021-10-01 08:00:00', '2021-10-01 08:30:00'),
('102', '180.149.130.174', '2021-10-01 07:50:00', '2021-10-01 08:20:00'),
('102', '120.245.11.2', '2021-10-02 08:00:00', '2021-10-02 08:30:00'),
('103', '27.184.97.3', '2021-09-23 10:00:00', '2021-09-27 10:30:00'),
('103', '27.184.97.3', '2021-10-03 07:50:00', '2021-10-03 09:20:00'),
('104', '27.184.97.34', '2021-09-24 11:00:00', '2021-09-27 11:30:00'),
('104', '27.184.97.34', '2021-10-03 07:50:00', '2021-10-03 08:20:00'),
('104', '27.184.97.34', '2021-10-03 08:50:00', '2021-10-03 10:20:00'),
('104', '120.245.11.89', '2021-10-03 08:40:00', '2021-10-03 10:30:00'),
('105', '119.180.192.212', '2021-10-04 09:10:00', '2021-10-04 09:30:00'),
('106', '119.180.192.66', '2021-10-04 08:40:00', '2021-10-04 10:30:00'),
('106', '119.180.192.66', '2021-10-05 21:50:00', '2021-10-05 22:40:00'),
('107', '219.134.104.7', '2021-09-25 12:00:00', '2021-09-27 12:30:00'),
('107', '219.134.104.7', '2021-10-05 22:00:00', '2021-10-05 23:00:00'),
('107', '219.134.104.7', '2021-10-06 09:10:00', '2021-10-06 10:20:00'),
('107', '27.184.97.46', '2021-10-06 09:00:00', '2021-10-06 10:00:00'),
('108', '101.227.131.22', '2021-10-06 09:00:00', '2021-10-06 10:00:00'),
('108', '101.227.131.22', '2021-10-06 22:00:00', '2021-10-06 23:00:00'),
('109', '101.227.131.29', '2021-09-26 13:00:00', '2021-09-27 13:30:00'),
('109', '101.227.131.29', '2021-10-06 08:50:00', '2021-10-06 10:20:00'),
('109', '101.227.131.29', '2021-10-08 09:00:00', '2021-10-08 09:10:00'),
('1010', '119.180.192.10', '2021-09-27 14:00:00', '2021-09-27 14:30:00'),
('1010', '119.180.192.10', '2021-10-09 08:50:00', '2021-10-09 10:20:00');
1.7 商品價格變更明細表
1)表結構
sku_id(商品id) | new_price(本次變更之後的價格) | change_date(變更日期) |
---|---|---|
1 | 1900.00 | 2021-09-25 |
1 | 2000.00 | 2021-09-26 |
2 | 80.00 | 2021-09-29 |
2 | 10.00 | 2021-09-30 |
2)建表語句
hive>
DROP TABLE IF EXISTS sku_price_modify_detail;
CREATE TABLE sku_price_modify_detail
(
`sku_id` string comment '商品id',
`new_price` decimal(16, 2) comment '更改後的價格',
`change_date` string comment '變動日期'
) COMMENT '商品價格變更明細表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table sku_price_modify_detail
values ('1', 1900, '2021-09-25'),
('1', 2000, '2021-09-26'),
('2', 80, '2021-09-29'),
('2', 10, '2021-09-30'),
('3', 4999, '2021-09-25'),
('3', 5000, '2021-09-26'),
('4', 5600, '2021-09-26'),
('4', 6000, '2021-09-27'),
('5', 490, '2021-09-27'),
('5', 500, '2021-09-28'),
('6', 1988, '2021-09-30'),
('6', 2000, '2021-10-01'),
('7', 88, '2021-09-28'),
('7', 100, '2021-09-29'),
('8', 800, '2021-09-28'),
('8', 600, '2021-09-29'),
('9', 1100, '2021-09-27'),
('9', 1000, '2021-09-28'),
('10', 90, '2021-10-01'),
('10', 100, '2021-10-02'),
('11', 66, '2021-10-01'),
('11', 50, '2021-10-02'),
('12', 35, '2021-09-28'),
('12', 20, '2021-09-29');
1.8 配送信息表
1)表結構
delivery_id(運單id) | order_id(訂單id) | user_id(用戶id) | order_date(下單日期) | custom_date(期望配送日期) |
---|---|---|---|---|
1 | 1 | 101 | 2021-09-27 | 2021-09-29 |
2 | 2 | 101 | 2021-09-28 | 2021-09-28 |
3 | 3 | 101 | 2021-09-29 | 2021-09-30 |
2)建表語句
hive>
DROP TABLE IF EXISTS delivery_info;
CREATE TABLE delivery_info
(
`delivery_id` string comment '配送單id',
`order_id` string comment '訂單id',
`user_id` string comment '用戶id',
`order_date` string comment '下單日期',
`custom_date` string comment '期望配送日期'
) COMMENT '郵寄信息表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table delivery_info
values ('1', '1', '101', '2021-09-27', '2021-09-29'),
('2', '2', '101', '2021-09-28', '2021-09-28'),
('3', '3', '101', '2021-09-29', '2021-09-30'),
('4', '4', '101', '2021-09-30', '2021-10-01'),
('5', '5', '102', '2021-10-01', '2021-10-01'),
('6', '6', '102', '2021-10-01', '2021-10-01'),
('7', '7', '102', '2021-10-01', '2021-10-03'),
('8', '8', '102', '2021-10-02', '2021-10-02'),
('9', '9', '103', '2021-10-02', '2021-10-03'),
('10', '10', '103', '2021-10-02', '2021-10-04'),
('11', '11', '103', '2021-10-02', '2021-10-02'),
('12', '12', '103', '2021-10-03', '2021-10-03'),
('13', '13', '104', '2021-10-03', '2021-10-04'),
('14', '14', '104', '2021-10-03', '2021-10-04'),
('15', '15', '104', '2021-10-03', '2021-10-03'),
('16', '16', '104', '2021-10-03', '2021-10-03'),
('17', '17', '105', '2021-10-04', '2021-10-04'),
('18', '18', '105', '2021-10-04', '2021-10-06'),
('19', '19', '105', '2021-10-04', '2021-10-06'),
('20', '20', '105', '2021-10-04', '2021-10-04'),
('21', '21', '106', '2021-10-04', '2021-10-04'),
('22', '22', '106', '2021-10-05', '2021-10-05'),
('23', '23', '106', '2021-10-05', '2021-10-05'),
('24', '24', '106', '2021-10-05', '2021-10-07'),
('25', '25', '107', '2021-10-05', '2021-10-05'),
('26', '26', '107', '2021-10-05', '2021-10-06'),
('27', '27', '107', '2021-10-06', '2021-10-06'),
('28', '28', '107', '2021-10-06', '2021-10-07'),
('29', '29', '108', '2021-10-06', '2021-10-06'),
('30', '30', '108', '2021-10-06', '2021-10-06'),
('31', '31', '108', '2021-10-07', '2021-10-09'),
('32', '32', '108', '2021-10-07', '2021-10-09'),
('33', '33', '109', '2021-10-07', '2021-10-08'),
('34', '34', '109', '2021-10-07', '2021-10-08'),
('35', '35', '109', '2021-10-08', '2021-10-10'),
('36', '36', '109', '2021-10-08', '2021-10-09'),
('37', '37', '1010', '2021-10-08', '2021-10-10'),
('38', '38', '1010', '2021-10-08', '2021-10-10'),
('39', '39', '1010', '2021-10-08', '2021-10-09'),
('40', '40', '1010', '2021-10-08', '2021-10-09');
1.9 好友關係表
1)表結構
user1_id(用戶1 id) | user2_id(用戶2 id) |
---|---|
101 | 1010 |
101 | 108 |
101 | 106 |
注:表中一行數據中的兩個user_id,表示兩個用戶互爲好友。
2)建表語句
hive>
DROP TABLE IF EXISTS friendship_info;
CREATE TABLE friendship_info(
`user1_id` string comment '用戶1id',
`user2_id` string comment '用戶2id'
) COMMENT '用戶關係表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table friendship_info
values ('101', '1010'),
('101', '108'),
('101', '106'),
('101', '104'),
('101', '102'),
('102', '1010'),
('102', '108'),
('102', '106'),
('102', '104'),
('102', '102'),
('103', '1010'),
('103', '108'),
('103', '106'),
('103', '104'),
('103', '102'),
('104', '1010'),
('104', '108'),
('104', '106'),
('104', '104'),
('104', '102'),
('105', '1010'),
('105', '108'),
('105', '106'),
('105', '104'),
('105', '102'),
('106', '1010'),
('106', '108'),
('106', '106'),
('106', '104'),
('106', '102'),
('107', '1010'),
('107', '108'),
('107', '106'),
('107', '104'),
('107', '102'),
('108', '1010'),
('108', '108'),
('108', '106'),
('108', '104'),
('108', '102'),
('109', '1010'),
('109', '108'),
('109', '106'),
('109', '104'),
('109', '102'),
('1010', '1010'),
('1010', '108'),
('1010', '106'),
('1010', '104'),
('1010', '102');
1.10 收藏信息表
1)表結構
user_id(用戶id) | sku_id(商品id) | create_date(收藏日期) |
---|---|---|
101 | 3 | 2021-09-23 |
101 | 12 | 2021-09-23 |
101 | 6 | 2021-09-25 |
2)建表語句
hive>
DROP TABLE IF EXISTS favor_info;
CREATE TABLE favor_info
(
`user_id` string comment '用戶id',
`sku_id` string comment '商品id',
`create_date` string comment '收藏日期'
) COMMENT '用戶收藏表'
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';
3)數據裝載
hive>
insert overwrite table favor_info
values ('101', '3', '2021-09-23'),
('101', '12', '2021-09-23'),
('101', '6', '2021-09-25'),
('101', '10', '2021-09-21'),
('101', '5', '2021-09-25'),
('102', '1', '2021-09-24'),
('102', '2', '2021-09-24'),
('102', '8', '2021-09-23'),
('102', '12', '2021-09-22'),
('102', '11', '2021-09-23'),
('102', '9', '2021-09-25'),
('102', '4', '2021-09-25'),
('102', '6', '2021-09-23'),
('102', '7', '2021-09-26'),
('103', '8', '2021-09-24'),
('103', '5', '2021-09-25'),
('103', '6', '2021-09-26'),
('103', '12', '2021-09-27'),
('103', '7', '2021-09-25'),
('103', '10', '2021-09-25'),
('103', '4', '2021-09-24'),
('103', '11', '2021-09-25'),
('103', '3', '2021-09-27'),
('104', '9', '2021-09-28'),
('104', '7', '2021-09-28'),
('104', '8', '2021-09-25'),
('104', '3', '2021-09-28'),
('104', '11', '2021-09-25'),
('104', '6', '2021-09-25'),
('104', '12', '2021-09-28'),
('105', '8', '2021-10-08'),
('105', '9', '2021-10-07'),
('105', '7', '2021-10-07'),
('105', '11', '2021-10-06'),
('105', '5', '2021-10-07'),
('105', '4', '2021-10-05'),
('105', '10', '2021-10-07'),
('106', '12', '2021-10-08'),
('106', '1', '2021-10-08'),
('106', '4', '2021-10-04'),
('106', '5', '2021-10-08'),
('106', '2', '2021-10-04'),
('106', '6', '2021-10-04'),
('106', '7', '2021-10-08'),
('107', '5', '2021-09-29'),
('107', '3', '2021-09-28'),
('107', '10', '2021-09-27'),
('108', '9', '2021-10-08'),
('108', '3', '2021-10-10'),
('108', '8', '2021-10-10'),
('108', '10', '2021-10-07'),
('108', '11', '2021-10-07'),
('109', '2', '2021-09-27'),
('109', '4', '2021-09-29'),
('109', '5', '2021-09-29'),
('109', '9', '2021-09-30'),
('109', '8', '2021-09-26'),
('1010', '2', '2021-09-29'),
('1010', '9', '2021-09-29'),
('1010', '1', '2021-10-01');
第二章 練習題
2.1 查詢累積銷量排名第二的商品
2.1.1 題目需求
查詢訂單明細表(order_detail)中銷量(下單件數)排名第二的商品id,如果不存在返回null,如果存在多個排名第二的商品則需要全部返回。期望結果如下:
sku_id |
---|
11 |
2.1.2 代碼實現
hive>
select sku_id
from (
select sku_id
from (
select sku_id,
order_num,
-- 窗口函數要排序不要分區
dense_rank() over (order by order_num desc) rk
from (
-- 每個商品的銷量
select sku_id, sum(sku_num) order_num
from order_detail
group by sku_id
) t1
) t2
where rk = 2
) t3
right join -- 爲保證,沒有第二名的情況下,返回null
(
select 1
) t4
on 1 = 1;
窗口排序函數
-
row_number:在每個分組中,爲每行分配一個從1開始的唯一序列號,遞增,不考慮重複;(1234567……)
-
rank: 在每個分組中,爲每行分配一個從1開始的序列號,考慮重複,擠佔後續位置;(12225……)
-
dense_rank: 在每個分組中,爲每行分配一個從1開始的序列號,考慮重複,不擠佔後續位置(12223……)
也有人會將分組聚合子查詢和窗口排序子查詢合在一起,要理解這種方法,但可讀性不高
select sku_id,
sum(sku_num) order_num,
-- 窗口函數要排序不要分區
dense_rank() over (order by sum(sku_num) desc) rk
from order_detail
group by sku_id
2.2 查詢至少連續三天下單的用戶
2.2.1 題目需求
查詢訂單信息表(order_info)中最少連續3天下單的用戶id,期望結果如下:
user_id |
---|
101 |
2.2.2 代碼實現
hive中的去重
-
distinct關鍵字
-
group by
-
窗口函數排序(思路同上一題,按分區窗口排序後,取第一個)
方案1
使用lead滑窗,datediff求差值
select distinct user_id
from (
select
user_id,
datediff(lead2,create_date) diff
from (
select
user_id,
create_date,
lead(create_date, 2,'9999-12-31') over (partition by user_id order by create_date) lead2
from (
select
distinct
user_id,
create_date
from order_info
) t1
) t2
) t3
where diff=2;
方案2
統計相同日期差值的個數
select distinct user_id
from (
select user_id
from (
select user_id
, create_date
, date_sub(create_date, row_number() over (partition by user_id order by create_date)) diff
from (
select user_id, create_date
from order_info
-- group by 去重
group by user_id, create_date
) t1 -- 同一天可能多個用戶下單,進行去重
) t2 -- 判斷一串日期是否連續:若連續,用這個日期減去它的排名,會得到一個相同的結果
group by user_id, diff
having count(diff) >= 3 -- 連續下單大於等於三天
) t3;
date函數
-
datediff('endTime',‘startTime’)返回前後日期之間的天數差
hive> select datediff('2019-06-25','2019-06-20'); 5 hive> select datediff('2019-06-25','2019-06-27'); -2
-
date_sub(‘yyyy-MM-dd’,n/-m)返回初始日期n天前、m天后的日期
hive> select date_sub('2019-06-25',4); 2019-06-21 hive> select date_sub('2019-06-25',-2); 2019-06-27
方案3
利用滑動窗口的範圍
select distinct user_id
from (
select user_id,
ts,
-- 86400秒 一天
count(*) over(partition by user_id order by ts range between 86400 preceding and 86400 following) cnt
from (
select user_id,
-- 時間字符串轉爲時間戳 秒
unix_timestamp(create_date,'yyyy-MM-dd') ts
from (
select
distinct
user_id, create_date
from order_info
) t1
) t2
) t3
where cnt=3;
2.3 查詢各品類銷售商品的種類數及銷量最高的商品
2.3.1 題目需求
從訂單明細表(order_detail)統計各品類銷售出的商品種類數及累積銷量最好的商品,期望結果如下:
category_id(分類id) | category_name(分類名稱) | sku_id(銷量最好的商品id) | name(商品名稱) | order_num(銷量最好的商品銷量) | order_cnt(商品種類數量) |
---|---|---|---|---|---|
1 | 數碼 | 2 | 手機殼 | 302 | 4 |
2 | 廚衛 | 8 | 微波爐 | 253 | 4 |
3 | 戶外 | 12 | 遮陽傘 | 349 | 4 |
2.3.2 代碼實現
select category_id,
category_name,
sku_id,
name,
order_num,
sku_cnt
from (
select od.sku_id,
sku.name,
sku.category_id,
cate.category_name,
order_num,
-- 分組開窗 依舊是Top K問題
rank() over (partition by sku.category_id order by order_num desc) rk,
-- 使用開窗保證每一行都要有一個count
count(distinct od.sku_id) over (partition by sku.category_id) sku_cnt
from (
-- 商品銷量
select sku_id,
sum(sku_num) order_num
from order_detail
group by sku_id
) od
-- 商品信息表sku_info中有商品名稱 分類信息
left join sku_info sku on od.sku_id = sku.sku_id
-- 商品分類信息表category_info中有分類名稱
left join category_info cate on sku.category_id = cate.category_id
) t1
where rk = 1;
2.4 查詢用戶的累計消費金額及VIP等級
2.4.1 題目需求
從訂單信息表(order_info)中統計每個用戶截止其每個下單日期的累積消費金額,以及每個用戶在其每個下單日期的VIP等級。
用戶vip等級根據累積消費金額計算,計算規則如下:
設累積消費總額爲X,
若0=<X<10000,則vip等級爲普通會員
若10000<=X<30000,則vip等級爲青銅會員
若30000<=X<50000,則vip等級爲白銀會員
若50000<=X<80000,則vip爲黃金會員
若80000<=X<100000,則vip等級爲白金會員
若X>=100000,則vip等級爲鑽石會員
期望結果如下:
user_id(用戶id) | create_date(下單日期) | sum_so_far(截至每個下單日期的累計下單金額) | vip_level(每個下單日期的VIP等級) |
---|---|---|---|
101 | 2021-09-27 | 29000.00 | 青銅會員 |
101 | 2021-09-28 | 99500.00 | 白金會員 |
101 | 2021-09-29 | 142800.00 | 鑽石會員 |
101 | 2021-09-30 | 143660.00 | 鑽石會員 |
102 | 2021-10-01 | 171680.00 | 鑽石會員 |
102 | 2021-10-02 | 177850.00 | 鑽石會員 |
103 | 2021-10-02 | 69980.00 | 黃金會員 |
103 | 2021-10-03 | 75890.00 | 黃金會員 |
104 | 2021-10-03 | 89880.00 | 白金會員 |
105 | 2021-10-04 | 120100.00 | 鑽石會員 |
106 | 2021-10-04 | 9390.00 | 普通會員 |
106 | 2021-10-05 | 119150.00 | 鑽石會員 |
107 | 2021-10-05 | 69850.00 | 黃金會員 |
107 | 2021-10-06 | 124150.00 | 鑽石會員 |
108 | 2021-10-06 | 101070.00 | 鑽石會員 |
108 | 2021-10-07 | 155770.00 | 鑽石會員 |
109 | 2021-10-07 | 129480.00 | 鑽石會員 |
109 | 2021-10-08 | 153500.00 | 鑽石會員 |
1010 | 2021-10-08 | 51950.00 | 黃金會員 |
2.4.2 代碼實現
select user_id,
create_date,
sum_so_far,
case
when sum_so_far >= 100000 then '鑽石會員'
when sum_so_far >= 80000 then '白金會員'
when sum_so_far >= 50000 then '黃金會員'
when sum_so_far >= 30000 then '白銀會員'
when sum_so_far >= 10000 then '青銅會員'
when sum_so_far >= 0 then '普通會員'
end vip_level
from (
select user_id,
create_date,
-- 開窗計算累積金額
-- rang 負無窮到當前行
sum(total_amount_per_day) over (partition by user_id order by create_date rows between unbounded preceding and current row) sum_so_far
from (
-- 每個用戶在每天的消費金額
select user_id,
create_date,
sum(total_amount) total_amount_per_day
from order_info
group by user_id, create_date
) t1
) t2;
2.5 查詢首次下單後第二天連續下單的用戶比率
2.5.1 題目需求
從訂單信息表(order_info)中查詢首次下單後第二天仍然下單的用戶佔所有下單用戶的比例,結果保留一位小數,使用百分數顯示,期望結果如下:
percentage |
---|
70.0% |
2.5.2 代碼實現
先找到每個用戶的前兩個下單日期
select
concat(round(sum(if(datediff(buy_date_second, buy_date_first) = 1, 1, 0)) / count(*) * 100, 1), '%') percentage
from (
select user_id,
min(create_date) buy_date_first,
max(create_date) buy_date_second
from (
-- 所有下單日期裏面的前兩個
select user_id,
create_date,
rank() over (partition by user_id order by create_date) rk
from (
select
distinct
user_id,
create_date
from order_info
) t1
) t2
where rk <= 2
group by user_id
) t3;
2.6 每個商品銷售首年的年份、銷售數量和銷售金額
2.6.1 題目需求
從訂單明細表(order_detail)統計每個商品銷售首年的年份,銷售數量和銷售總額。
期望結果如下:
sku_id(商品id) | year(銷售首年年份) | order_num(首年銷量) | order_amount(首年銷售金額) |
---|---|---|---|
1 | 2021 | 51 | 102000.00 |
2 | 2021 | 302 | 3020.00 |
3 | 2021 | 36 | 180000.00 |
4 | 2021 | 53 | 318000.00 |
5 | 2021 | 242 | 121000.00 |
6 | 2021 | 32 | 64000.00 |
7 | 2021 | 252 | 25200.00 |
8 | 2021 | 253 | 151800.00 |
9 | 2021 | 194 | 194000.00 |
10 | 2021 | 299 | 29900.00 |
11 | 2021 | 320 | 16000.00 |
12 | 2021 | 349 | 6980.00 |
2.6.2 代碼實現
依舊是Top K排序問題,得到每個商品最早的銷售時間
select sku_id,
year(create_date) as year,
sum(sku_num) as order_num,
sum(price*sku_num) as order_amount
from (
select order_id,
sku_id,
price,
sku_num,
create_date,
rank() over (partition by sku_id order by year(create_date)) rk
from order_detail
) t1
where rk = 1
group by sku_id,year(create_date);
最開始的思路是直接使用聚合函數
select sku_id,
min(year(create_date)) as year,
sum(sku_num) as order_num,
sum(price*sku_num) as order_amount
from order_detail
group by sku_id;
但,發現無法得到商品最早的銷售時間,這隻能使用開窗函數,注意與2.8的區別
2.7 篩選去年總銷量小於100的商品
2.7.1 題目需求
從訂單明細表(order_detail)中篩選出去年總銷量小於100的商品及其銷量,假設今天的日期是2022-01-10,不考慮上架時間小於一個月的商品,期望結果如下:
sku_id(商品id) | name(商品名稱) | order_num(銷量) |
---|---|---|
1 | xiaomi 10 | 51 |
3 | apple 12 | 36 |
4 | xiaomi 13 | 53 |
6 | 洗碗機 | 32 |
2.7.2 代碼實現
select t1.sku_id,
t2.name,
order_num
from (
select sku_id,
sum(sku_num) order_num
from order_detail
where year(create_date) = '2021'
and sku_id in (
-- 上架時間超過一個月的商品
select sku_id
from sku_info
where datediff('2022-01-10', from_date) > 30
)
group by sku_id
having sum(sku_num) < 100
) t1
left join
sku_info t2
on t1.sku_id = t2.sku_id;
2.8 查詢每日新用戶數
2.8.1 題目需求
從用戶登錄明細表(user_login_detail)中查詢每天的新增用戶數,若一個用戶在某天登錄了,且在這一天之前沒登錄過,則任務該用戶爲這一天的新增用戶。期望結果如下:
login_date_first(日期) | user_count(新增用戶數) |
---|---|
2021-09-21 | 1 |
2021-09-22 | 1 |
2021-09-23 | 1 |
2021-09-24 | 1 |
2021-09-25 | 1 |
2021-09-26 | 1 |
2021-09-27 | 1 |
2021-10-04 | 2 |
2021-10-06 | 1 |
2.8.2 代碼實現
select
login_date_first,
count(*) user_count
from
(
select
user_id,
min(date_format(login_ts,'yyyy-MM-dd')) login_date_first
from user_login_detail
group by user_id
)t1
group by login_date_first;
2.9 統計每個商品的銷量最高的日期
2.9.1 題目需求
從訂單明細表(order_detail)中統計出每種商品銷售件數最多的日期及當日銷量,如果有同一商品多日銷量並列的情況,取其中的最小日期。期望結果如下:
sku_id(商品id) | create_date(銷量最高的日期) | sum_num(銷量) |
---|---|---|
1 | 2021-10-02 | 9 |
2 | 2021-10-04 | 60 |
3 | 2021-10-05 | 9 |
4 | 2021-10-07 | 10 |
5 | 2021-10-03 | 47 |
6 | 2021-10-03 | 8 |
7 | 2021-10-05 | 58 |
8 | 2021-10-08 | 59 |
9 | 2021-10-01 | 45 |
10 | 2021-10-08 | 94 |
11 | 2021-10-08 | 95 |
12 | 2021-10-08 | 83 |
2.9.2 代碼實現
select sku_id,
create_date,
sum_num
from (
select sku_id,
create_date,
sum_num,
row_number() over (partition by sku_id order by sum_num desc,create_date asc) rn
from (
-- 統計 每個商品當日銷售量
select sku_id,
create_date,
sum(sku_num) sum_num
from order_detail
group by sku_id, create_date
) t1
) t2
where rn = 1;
2.10 查詢銷售件數高於品類平均數的商品
2.10.1 題目需求
從訂單明細表(order_detail)中查詢累積銷售件數高於其所屬品類平均數的商品,期望結果如下:
sku_id | name | sum_num | cate_avg_num |
---|---|---|---|
2 | 手機殼 | 302 | 110.5 |
5 | 破壁機 | 242 | 194.75 |
7 | 熱水壺 | 252 | 194.75 |
8 | 微波爐 | 253 | 194.75 |
10 | 帳篷 | 299 | 290.5 |
11 | 燒烤架 | 320 | 290.5 |
12 | 遮陽傘 | 349 | 290.5 |
2.10.2 代碼實現
select sku_id,
name,
sum_num,
cate_avg_num
from (
select od.sku_id,
category_id,
name,
sum_num,
-- 開窗函數 計算每個類別中的商品平均銷量
avg(sum_num) over (partition by category_id) cate_avg_num
from (
-- 每個商品的總銷售數
select sku_id,
sum(sku_num) sum_num
from order_detail
group by sku_id
) od
-- 左連接上類別
left join(
select sku_id,
name,
category_id
from sku_info
) sku
on od.sku_id = sku.sku_id) t1
where sum_num > cate_avg_num;
2.11 用戶註冊、登錄、下單綜合統計
2.11.1 題目需求
從用戶登錄明細表(user_login_detail)和訂單信息表(order_info)中查詢每個用戶的註冊日期(首次登錄日期)、總登錄次數以及其在2021年的登錄次數、訂單數和訂單總額。期望結果如下:
user_id(用戶id) | register_date(註冊日期) | total_login_count(累積登錄次數) | login_count_2021(2021年登錄次數) | order_count_2021(2021年下單次數) | order_amount_2021(2021年訂單金額) |
---|---|---|---|---|---|
101 | 2021-09-21 | 5 | 5 | 4 | 143660.00 |
102 | 2021-09-22 | 4 | 4 | 4 | 177850.00 |
103 | 2021-09-23 | 2 | 2 | 4 | 75890.00 |
104 | 2021-09-24 | 4 | 4 | 4 | 89880.00 |
105 | 2021-10-04 | 1 | 1 | 4 | 120100.00 |
106 | 2021-10-04 | 2 | 2 | 4 | 119150.00 |
107 | 2021-09-25 | 4 | 4 | 4 | 124150.00 |
108 | 2021-10-06 | 2 | 2 | 4 | 155770.00 |
109 | 2021-09-26 | 3 | 3 | 4 | 153500.00 |
1010 | 2021-09-27 | 2 | 2 | 4 | 51950.00 |
2.11.2 代碼實現
select login.user_id,
register_date,
total_login_count,
login_count_2021,
order_count_2021,
order_amount_2021
from (
select user_id,
-- 首次登陸(註冊日期)
min(date_format(login_ts, 'yyyy-MM-dd')) register_date,
-- 累計登錄次數
count(1) total_login_count,
-- 2021年登錄次數
count(if(year(login_ts) = '2021', 1, null)) login_count_2021
from user_login_detail
group by user_id
) login
join
(
select user_id,
-- 下單次數
count(distinct(order_id)) order_count_2021,
-- 訂單金額
sum(total_amount) order_amount_2021
from order_info
where year(create_date) = '2021'
group by user_id
) oi
on login.user_id = oi.user_id;
2.12 查詢指定日期的全部商品價格
2.12.1 題目需求
從商品價格修改明細表(sku_price_modify_detail)中查詢2021-10-01的全部商品的價格,假設所有商品初始價格默認都是99。期望結果如下:
sku_id(商品id) | price(商品價格) |
---|---|
1 | 2000.00 |
2 | 10.00 |
3 | 5000.00 |
4 | 6000.00 |
5 | 500.00 |
6 | 2000.00 |
7 | 100.00 |
8 | 600.00 |
9 | 1000.00 |
10 | 90.00 |
11 | 66.00 |
12 | 20.00 |
2.12.2 代碼實現
某個商品在2021-10-01不一定會更改價格,需要獲得最近日期的價格(TopK問題)
select sku_info.sku_id,
nvl(new_price, 99) price
from sku_info
left join
(
select sku_id,
new_price
from (
select sku_id,
new_price,
change_date,
row_number() over (partition by sku_id order by change_date desc) rn
from sku_price_modify_detail
-- 過濾
where change_date <= '2021-10-01'
) t1
where rn = 1
) t2
on sku_info.sku_id = t2.sku_id;
2.13 即時訂單比例
2.13.1 題目需求
訂單配送中,如果期望配送日期和下單日期相同,稱爲即時訂單,如果期望配送日期和下單日期不同,稱爲計劃訂單。
請從配送信息表(delivery_info)中求出每個用戶的首單(用戶的第一個訂單)中即時訂單的比例,保留兩位小數,以小數形式顯示。期望結果如下:
percentage |
---|
0.5 |
2.13.2 代碼實現
每個用戶的首單(TopK問題)
select
cast(sum(if(order_date=custom_date,1,0))/count(*) as decimal(10,2)) percentage
from
(
select
delivery_id,
user_id,
order_date,
custom_date,
row_number() over (partition by user_id order by order_date) rn
from delivery_info
)t1
where rn=1;
2.14 向用戶推薦朋友收藏的商品
2.14.1 題目需求
現需要請向所有用戶推薦其朋友收藏但是用戶自己未收藏的商品,請從好友關係表(friendship_info)和收藏表(favor_info)中查詢出應向哪位用戶推薦哪些商品。期望結果如下:
1)部分結果展示
user_id(用戶id) | sku_id(應向該用戶推薦的商品id) |
---|---|
101 | 2 |
101 | 4 |
101 | 7 |
101 | 9 |
101 | 8 |
101 | 11 |
101 | 1 |
2.14.2 代碼實現
select
distinct t1.user_id,
friend_favor.sku_id
from
(
-- union 進行表冗餘,將雙向關係,表示爲單向關係
select
user1_id user_id,
user2_id friend_id
from friendship_info
union
select
user2_id,
user1_id
from friendship_info
)t1
-- 所有有收藏的朋友
left join favor_info friend_favor
on t1.friend_id=friend_favor.user_id
-- 朋友有收藏,但是用戶本身沒有收藏
left join favor_info user_favor
on t1.user_id=user_favor.user_id
and friend_favor.sku_id=user_favor.sku_id
where user_favor.sku_id is null;
2.15 查詢所有用戶的連續登錄兩天及以上的日期區間
2.15.1 題目需求
從登錄明細表(user_login_detail)中查詢出,所有用戶的連續登錄兩天及以上的日期區間,以登錄時間(login_ts)爲準。期望結果如下:
user_id(用戶id) | start_date(開始日期) | end_date(結束日期) |
---|---|---|
101 | 2021-09-27 | 2021-09-30 |
102 | 2021-10-01 | 2021-10-02 |
106 | 2021-10-04 | 2021-10-05 |
107 | 2021-10-05 | 2021-10-06 |
2.15.2 代碼實現
注意該題需要獲得的是時間區間,而不是用戶id。這裏不同於2.2查詢至少連續三天下單的用戶
select user_id,
min(login_date) start_date,
max(login_date) end_date
from (
select user_id,
login_date,
-- 差值
date_sub(login_date, rn) flag
from (
select user_id,
login_date,
-- 排序
row_number() over (partition by user_id order by login_date) rn
from (
-- 按用戶登錄日期去重
select user_id,
date_format(login_ts, 'yyyy-MM-dd') login_date
from user_login_detail
group by user_id, date_format(login_ts, 'yyyy-MM-dd')
) t1
) t2
) t3
group by user_id, flag
-- 具有相同的flag數大於等於2表明連續登錄兩天及以上
having count(*) >= 2;
2.16 男性和女性每日的購物總金額統計
2.16.1 題目需求
從訂單信息表(order_info)和用戶信息表(user_info)中,分別統計每天男性和女性用戶的訂單總金額,如果當天男性或者女性沒有購物,則統計結果爲0。期望結果如下:
create_date(日期) | total_amount_male(男性用戶總金額) | total_amount_female(女性用戶總金額) |
---|---|---|
2021-09-27 | 29000.00 | 0.00 |
2021-09-28 | 70500.00 | 0.00 |
2021-09-29 | 43300.00 | 0.00 |
2021-09-30 | 860.00 | 0.00 |
2021-10-01 | 0.00 | 171680.00 |
2021-10-02 | 0.00 | 76150.00 |
2021-10-03 | 89880.00 | 5910.00 |
2021-10-04 | 9390.00 | 120100.00 |
2021-10-05 | 109760.00 | 69850.00 |
2021-10-06 | 101070.00 | 54300.00 |
2021-10-07 | 54700.00 | 129480.00 |
2021-10-08 | 51950.00 | 24020.00 |
2.16.2 代碼實現
select create_date,
sum(if(gender = '男', total_amount, 0)) total_amount_male,
sum(if(gender = '女', total_amount, 0)) total_amount_female
from
order_info oi left join user_info ui on oi.user_id = ui.user_id
group by create_date;
2.17 訂單金額趨勢分析
2.17.1 題目需求
查詢截止每天的最近3天內的訂單金額總和以及訂單金額日平均值,保留兩位小數,四捨五入。期望結果如下:
create_date(日期) | total_3d(最近3日訂單金額總和) | avg_ad(最近3日訂單金額日平均值) |
---|---|---|
2021-09-27 | 29000.00 | 29000.00 |
2021-09-28 | 99500.00 | 49750.00 |
2021-09-29 | 142800.00 | 47600.00 |
2021-09-30 | 114660.00 | 38220.00 |
2021-10-01 | 215840.00 | 71946.67 |
2021-10-02 | 248690.00 | 82896.67 |
2021-10-03 | 343620.00 | 114540.00 |
2021-10-04 | 301430.00 | 100476.67 |
2021-10-05 | 404890.00 | 134963.33 |
2021-10-06 | 464470.00 | 154823.33 |
2021-10-07 | 519160.00 | 173053.33 |
2021-10-08 | 415520.00 | 138506.67 |
2.17.2 代碼實現
注意要求是3天內,不能用rows,而要用range
select create_date,
round(sum(total_amount_by_day) over (order by datediff(create_date,'2010-01-01') range BETWEEN 2 preceding and current row ),2) total_3d,
round(avg(total_amount_by_day) over (order by datediff(create_date,'2010-01-01') range BETWEEN 2 preceding and current row ), 2) avg_3d
from (
-- 聚集total_amount
select create_date,
sum(total_amount) total_amount_by_day
from order_info
group by create_date
) t1;
2.18 購買過商品1和商品2但是沒有購買商品3的顧客
2.18.1 題目需求
從訂單明細表(order_detail)中查詢出所有購買過商品1和商品2,但是沒有購買過商品3的用戶,期望結果如下:
user_id |
---|
103 |
105 |
2.18.2 代碼實現
高級函數
-
collect_set
對於非group by字段,用Hive的collect_set函數收集這些字段,返回一個數組;
-
array_contains
用於判定包含(array_contains)或不包含(!array_contains)關係。
select user_id
from (
select user_id,
collect_set(sku_id) skus
from order_detail od
left join
order_info oi
on od.order_id = oi.order_id
group by user_id
) t1
where array_contains(skus, '1')
and array_contains(skus, '2')
and !array_contains(skus, '3');
2.19 統計每日商品1和商品2銷量的差值
2.19.1 題目需求
從訂單明細表(order_detail)中統計每天商品1和商品2銷量(件數)的差值(商品1銷量-商品2銷量),期望結果如下:
create_date | diff |
---|---|
2021-09-27 | 2 |
2021-10-01 | -10 |
2021-10-02 | -49 |
2021-10-03 | 4 |
2021-10-04 | -55 |
2021-10-05 | -30 |
2021-10-06 | -49 |
2021-10-07 | -40 |
2021-10-08 | -24 |
2.19.2 代碼實現
select create_date,
sum(if(sku_id = '1', sku_num, 0)) - sum(if(sku_id = '2', sku_num, 0)) diff
from order_detail
where sku_id in ('1', '2')
group by create_date;
2.20 查詢出每個用戶的最近三筆訂單
2.20.1 題目需求
從訂單信息表(order_info)中查詢出每個用戶的最近三筆訂單,期望結果如下:
user_id | order_id | create_date |
---|---|---|
101 | 2 | 2021-09-28 |
101 | 3 | 2021-09-29 |
101 | 4 | 2021-09-30 |
102 | 5 | 2021-10-01 |
102 | 6 | 2021-10-01 |
102 | 8 | 2021-10-02 |
103 | 9 | 2021-10-02 |
103 | 10 | 2021-10-02 |
103 | 12 | 2021-10-03 |
104 | 13 | 2021-10-03 |
104 | 14 | 2021-10-03 |
104 | 15 | 2021-10-03 |
105 | 17 | 2021-10-04 |
105 | 18 | 2021-10-04 |
105 | 19 | 2021-10-04 |
106 | 22 | 2021-10-05 |
106 | 23 | 2021-10-05 |
106 | 24 | 2021-10-05 |
107 | 25 | 2021-10-05 |
107 | 27 | 2021-10-06 |
107 | 28 | 2021-10-06 |
108 | 29 | 2021-10-06 |
108 | 31 | 2021-10-07 |
108 | 32 | 2021-10-07 |
109 | 33 | 2021-10-07 |
109 | 35 | 2021-10-08 |
109 | 36 | 2021-10-08 |
1010 | 37 | 2021-10-08 |
1010 | 38 | 2021-10-08 |
2.20.2 代碼實現
TopK問題
select user_id,
order_id,
create_date
from (
select user_id
, order_id
, create_date
, row_number() over (partition by user_id order by create_date desc) rk
from order_info
) t1
where rk <= 3;
如果是詢出每個用戶的最近三個下單日期的所有訂單
則將row_number()
換成dense_rank()
窗口排序函數
-
row_number:在每個分組中,爲每行分配一個從1開始的唯一序列號,遞增,不考慮重複;(1234567……)
-
rank: 在每個分組中,爲每行分配一個從1開始的序列號,考慮重複,擠佔後續位置;(12225……)
-
dense_rank: 在每個分組中,爲每行分配一個從1開始的序列號,考慮重複,不擠佔後續位置(12223……)
2.21 查詢每個用戶登錄日期的最大空檔期
2.21.1 題目需求
從登錄明細表(user_login_detail)中查詢每個用戶兩個登錄日期(以login_ts爲準)之間的最大的空檔期。統計最大空檔期時,用戶最後一次登錄至今的空檔也要考慮在內,假設今天爲2021-10-10。期望結果如下:
user_id(用戶id) | max_diff(最大空檔期) |
---|---|
101 | 10 |
102 | 9 |
103 | 10 |
104 | 9 |
105 | 6 |
106 | 5 |
107 | 10 |
108 | 4 |
109 | 10 |
1010 | 12 |
2.21.2 代碼實現
select
user_id,
max(diff) max_diff
from
(
select
user_id,
-- 計算空擋
datediff(next_login_date,login_date) diff
from
(
select
user_id,
login_date,
-- 開窗獲得下次登錄日期
lead(login_date,1,'2021-10-10') over(partition by user_id order by login_date) next_login_date
from
(
-- 對登錄時間進行去重
select
user_id,
date_format(login_ts,'yyyy-MM-dd') login_date
from user_login_detail
group by user_id,date_format(login_ts,'yyyy-MM-dd')
)t1
)t2
)t3
group by user_id;
窗口函數lead
- 功能:用於從當前數據中基於當前行的數據向後偏移取值
- 語法:lead(colName,N,defautValue)
- colName:取哪一列的值
- N:向後偏移N行
- defaultValue:如果取不到返回的默認值
- 分析
當前數據中記錄了每個用戶每一次登陸的日期,一個用戶在一天只有1條信息,我們可以基於用戶的登陸信息,找到如下規律:
連續兩天登陸 : 用戶下次登陸時間 = 本次登陸以後的第二天
連續三天登陸 : 用戶下下次登陸時間 = 本次登陸以後的第三天
……依次類推。
我們可以對用戶ID進行分區,按照登陸時間進行排序,通過lead函數計算出用戶下次登陸時間,通過日期函數計算出登陸以後第二天的日期,如果相等即爲連續兩天登錄。
- 統計連續2天登錄
select
userid,
logintime,
-- 本次登陸日期的第二天
date_add(logintime,1) as nextday,
-- 按照用戶id分區,按照登陸日期排序,取下一次登陸時間,取不到就爲0
lead(logintime,1,0) over (partition by userid order by logintime) as nextlogin
from tb_login;
with t1 as (
select
userid,
logintime,
-- 本次登陸日期的第二天
date_add(logintime,1) as nextday,
-- 按照用戶id分區,按照登陸日期排序,取下一次登陸時間,取不到就爲0
lead(logintime,1,0) over (partition by userid order by logintime) as nextlogin
from tb_login )
select distinct userid from t1 where nextday = nextlogin;
- 統計連續3天登錄
select
userid,
logintime,
-- 本次登陸日期的第三天
date_add(logintime,2) as nextday,
-- 按照用戶id分區,按照登陸日期排序,取下下一次登陸時間,取不到就爲0
lead(logintime,2,0) over (partition by userid order by logintime) as nextlogin
from tb_login;
with t1 as (
select
userid,
logintime,
-- 本次登陸日期的第三天
date_add(logintime,2) as nextday,
-- 按照用戶id分區,按照登陸日期排序,取下下一次登陸時間,取不到就爲0
lead(logintime,2,0) over (partition by userid order by logintime) as nextlogin
from tb_login )
select distinct userid from t1 where nextday = nextlogin;
- 統計連續N天登錄
select
userid,
logintime,
-- 本次登陸日期的第N天
date_add(logintime,N-1) as nextday,
-- 按照用戶id分區,按照登陸日期排序,取下下一次登陸時間,取不到就爲0
lead(logintime,N-1,0) over (partition by userid order by logintime) as nextlogin
from tb_login;
2.22 查詢相同時刻多地登陸的用戶
2.22.1 題目需求
從登錄明細表(user_login_detail)中查詢在相同時刻,多地登陸(ip_address不同)的用戶,期望結果如下:
user_id (用戶id) |
---|
101 |
102 |
104 |
107 |
2.22.2 代碼實現
select
distinct t2.user_id
from
(
select
t1.user_id,
-- t1.max_logout is not null
-- t1.max_logout>t1.login_ts
if(t1.max_logout is null ,2,if(t1.max_logout<t1.login_ts,1,0)) flag
from
(
select
user_id,
login_ts,
logout_ts,
-- 當前的最大登錄時間
max(logout_ts)over(partition by user_id order by login_ts rows between unbounded preceding and 1 preceding) max_logout
from
user_login_detail
)t1
)t2
where
t2.flag=0
2.23 銷售額完成任務指標的商品
2.23.1 題目需求
商家要求每個商品每個月需要售賣出一定的銷售總額
假設1號商品銷售總額大於21000,2號商品銷售總額大於10000,其餘商品沒有要求
請寫出SQL從訂單詳情表中(order_detail)查詢連續兩個月銷售總額大於等於任務總額的商品
結果如下:
sku_id(商品id) |
---|
1 |
2.23.2 代碼實現及步驟
-- 判斷是否爲連續兩個月
select
distinct t3.sku_id
from
(
select
t2.sku_id,
count(*)over(partition by t2.sku_id,t2.rymd) cn
from
(
select
t1.sku_id,
add_months(t1.ymd,-row_number()over(partition by t1.sku_id order by t1.ymd)) rymd
from
(
-- 求出1號商品和2號商品每個月的購買總額 並過濾掉沒有滿足指標的商品
select
sku_id,
concat(substring(create_date,0,7),'-01') ymd,
sum(price*sku_num) sku_sum
from
order_detail
where
sku_id=1 or sku_id=2
group by
sku_id,substring(create_date,0,7)
having
(sku_id=1 and sku_sum>=21000) or (sku_id=2 and sku_sum>=10000)
)t1
)t2
)t3
where
t3.cn>=2
2.24 根據商品銷售情況進行商品分類
2.24.1 題目需求
從訂單詳情表中(order_detail)對銷售件數對商品進行分類,0-5000爲冷門商品,5001-19999位一般商品,20000往上爲熱門商品,並求出不同類別商品的數量
結果如下:
Category(類型) | Cn(數量) |
---|---|
一般商品 | 1 |
冷門商品 | 10 |
熱門商品 | 1 |
2.24.2 代碼實現
select
t2.category,
count(*) cn
from
(
select
t1.sku_id,
case
when t1.sku_sum >=0 and t1.sku_sum<=5000 then '冷門商品'
when t1.sku_sum >=5001 and t1.sku_sum<=19999 then '一般商品'
when t1.sku_sum >=20000 then '熱門商品'
end category
from
(
select
sku_id,
sum(sku_num) sku_sum
from
order_detail
group by
sku_id
)t1
)t2
group by t2.category
2.25 各品類銷量前三的所有商品
2.25.1 題目需求
從訂單詳情表中(order_detail)和商品(sku_info)中查詢各個品類銷售數量前三的商品。如果該品類小於三個商品,則輸出所有的商品銷量。
結果如下:
Sku_id(商品id) | Category_id(品類id) |
---|---|
2 | 1 |
4 | 1 |
1 | 1 |
8 | 2 |
7 | 2 |
5 | 2 |
12 | 3 |
11 | 3 |
10 | 3 |
2.25.2 代碼實現
select
t2.sku_id,
t2.category_id
from
(
select
t1.sku_id,
si.category_id,
rank()over(partition by category_id order by t1.sku_sum desc) rk
from
(
-- 去重 匯聚
select
sku_id,
sum(sku_num) sku_sum
from
order_detail
group by
sku_id
)t1
join
sku_info si
on
t1.sku_id=si.sku_id
)t2
where
t2.rk<=3;
2.26 各品類中商品價格的中位數
2.26.1 題目需求
從商品(sku_info)中求中位數如果是偶數則輸出中間兩個值的平均值,如果是奇數,則輸出中間數即可。
結果如下:
Category_id(品類id) | Medprice(中位數) |
---|---|
1 | 3500.0 |
2 | 1250.0 |
3 | 510.0 |
2.26.2 代碼實現
select category_id,
cast(avg(price) as decimal(10,2) ) as medprice
from(
-- 求個每個品類 價格排序 商品數量
select
sku_id,
category_id,
price,
row_number()over(partition by category_id order by price desc) rn,
count(1)over(partition by category_id) cnt
from sku_info
)t1
-- 偶數取中間, 奇數取中間值
where if(cnt %2 = 0,rn in(cnt/2,cnt/2+1), rn = (cnt+1)/2)
group by category_id
2.27 找出銷售額連續3天超過100的商品
2.27.1 題目需求
從訂單詳情表(order_detail)中找出銷售額連續3天超過100的商品
結果如下:
Sku_id(商品id) |
---|
1 |
10 |
11 |
12 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
2.27.2代碼實現
統計相同日期差值的個數,同2.2的方案2思路一致
select
distinct sku_id
from(
select sku_id
from
(
select
sku_id,
create_date,
date_sub(create_date,row_number()over(partition by sku_id order by create_date)) diff
from
(
select
sku_id,
create_date,
sum(price*sku_num) sku_sum
from
order_detail
group by
sku_id,create_date
having
sku_sum>=100
)t1 -- 每個商品每天的銷售總額
)t2
group by sku_id,diff
-- 判斷連續三天以上
having count(diff)>=3
)t3
2.28 查詢有新註冊用戶的當天的新用戶數量、新用戶的第一天留存率
2.28.1 題目需求
從用戶登錄明細表(user_login_detail)中首次登錄算作當天新增,第二天也登錄了算作一日留存
結果如下:
first_login(註冊時間) | Register(新增用戶數) | Retention(留存率) |
---|---|---|
2021-09-21 | 1 | 0.0 |
2021-09-22 | 1 | 0.0 |
2021-09-23 | 1 | 0.0 |
2021-09-24 | 1 | 0.0 |
2021-09-25 | 1 | 0.0 |
2021-09-26 | 1 | 0.0 |
2021-09-27 | 1 | 0.0 |
2021-10-04 | 2 | 0.5 |
2021-10-06 | 1 | 0.0 |
2.28.2 代碼實現
-- 新增數量和留存率
select
t3.first_login,
t3.register,
cast (t3.remain_1/t3.registeras as decimal(10,2)) as retention
from
(
-- 每個用戶首次登錄時間 和 第二天是否登錄 並看每天新增和留存數量
select
t1.first_login,
count(t1.user_id) register,
count(t2.user_id) remain_1
from
(
-- 首次登錄時間
select
user_id,
date_format(min(login_ts),'yyyy-MM-dd') first_login
from
user_login_detail
group by
user_id
)t1
left join
user_login_detail t2
on
t1.user_id=t2.user_id and datediff(date_format(t2.login_ts,'yyyy-MM-dd'),t1.first_login)=1
group by
t1.first_login
)t3
2.29 求出商品連續售賣的時間區間
2.29.1 題目需求
從訂單詳情表(order_detail)中,求出商品連續售賣的時間區間
結果如下(截取部分):
Sku_id(商品id) | Start_date(起始時間) | End_date(結束時間) |
---|---|---|
1 | 2021-09-27 | 2021-09-27 |
1 | 2021-09-30 | 2021-10-01 |
1 | 2021-10-03 | 2021-10-08 |
10 | 2021-10-02 | 2021-10-03 |
10 | 2021-10-05 | 2021-10-08 |
11 | 2021-10-02 | 2021-10-08 |
12 | 2021-09-30 | 2021-09-30 |
12 | 2021-10-02 | 2021-10-06 |
12 | 2021-10-08 | 2021-10-08 |
2.29.2 代碼實現
-- 拿到每次售賣的區間
select
distinct
sku_id,
first_value(t1.create_date)over(partition by t1.sku_id,t1.ddrk order by t1.create_date rows between unbounded preceding and unbounded following) start_date,
last_value(t1.create_date)over(partition by t1.sku_id,t1.ddrk order by t1.create_date rows between unbounded preceding and unbounded following) end_date
from
(
-- 每個商品售賣的日期以及拿到按排序後日期的差值
select
sku_id,
create_date,
date_sub(create_date,rank()over(partition by sku_id order by create_date)) ddrk
from
order_detail
group by
sku_id,create_date
)t1
新學到的窗口函數
-
first_value取分組內排序後,截止到當前行,第一個值
-
last_value取分組內排序後,截止到當前行,最後一個值
2.30 登錄次數及交易次數統計
2.30.1 題目需求
分別從登陸明細表(user_login_detail)和配送信息表中用戶登錄時間和下單時間統計登陸次數和交易次數
結果如下(截取部分):
User_id(用戶id) | Login_date(登錄時間) | login_count(登陸次數) | order_count(交易次數) |
---|---|---|---|
101 | 2021-09-21 | 1 | 0 |
101 | 2021-09-27 | 1 | 1 |
101 | 2021-09-28 | 1 | 1 |
101 | 2021-09-29 | 1 | 1 |
101 | 2021-09-30 | 1 | 1 |
1010 | 2021-09-27 | 1 | 0 |
1010 | 2021-10-09 | 1 | 0 |
102 | 2021-09-22 | 1 | 0 |
102 | 2021-10-01 | 2 | 3 |
2.30.2 代碼實現
-- 拿到每個用戶每天的交易次數
select
t1.user_id,
t1.login_date,
collect_set(t1.login_count)[0] login_count ,
count(di.user_id) order_count
from
(
-- 拿到每個用戶每天的登錄次數
-- 去重 聚合
select
user_id,
date_format(login_ts,'yyyy-MM-dd') login_date,
count(*) login_count
from
user_login_detail
group by
user_id,date_format(login_ts,'yyyy-MM-dd')
)t1
left join
delivery_info di
on
t1.user_id=di.user_id and t1.login_date=di.order_date
group by
t1.user_id,t1.login_date
2.31 按年度列出每個商品銷售總額
2.31.1 題目需求
從訂單明細表(order_detail)中列出每個商品每個年度的購買總額
結果如下(截取部分):
Sku_id(商品id) | Year_date(年份) | Sku_sum(銷售總額) |
---|---|---|
1 | 2021 | 102000.00 |
10 | 2021 | 29900.00 |
11 | 2021 | 16000.00 |
12 | 2021 | 413640.00 |
2 | 2021 | 60440.00 |
3 | 2021 | 180000.00 |
4 | 2021 | 318000.00 |
5 | 2021 | 121000.00 |
6 | 2021 | 64000.00 |
7 | 2021 | 25200.00 |
8 | 2021 | 151800.00 |
9 | 2021 | 194000.00 |
2.31.2 代碼實現
select
sku_id,
year(create_date) year_date,
sum(price*sku_num) sku_sum
from
order_detail
group by
sku_id,year(create_date)
2.32. 某周內每件商品每天銷售情況
2.32.1 題目需求
從訂單詳情表(order_detail)中查詢2021年9月27號-2021年10月3號這一週所有商品每天銷售情況。
結果如下:
Sku_id(商品id) | Monday | Tuesday | Wednesday | Thursday | Friday | Saturday | Sunday |
---|---|---|---|---|---|---|---|
1 | 0 | 0 | 9 | 8 | 0 | 4 | 2 |
10 | 0 | 0 | 0 | 0 | 48 | 69 | 0 |
11 | 0 | 0 | 0 | 0 | 15 | 61 | 0 |
12 | 0 | 0 | 43 | 0 | 31 | 20400 | 0 |
2 | 0 | 0 | 0 | 18 | 5800 | 0 | 0 |
3 | 0 | 0 | 0 | 6 | 0 | 1 | 5 |
4 | 9 | 0 | 0 | 8 | 1 | 5 | 0 |
5 | 33 | 0 | 0 | 0 | 24 | 47 | 0 |
6 | 0 | 0 | 0 | 1 | 5 | 8 | 0 |
7 | 0 | 37 | 0 | 17 | 0 | 20 | 0 |
8 | 0 | 46 | 0 | 48 | 39 | 0 | 0 |
9 | 0 | 12 | 0 | 45 | 0 | 0 | 0 |
2.32.2 代碼實現
select
sku_id,
sum(if(dayofweek(create_date)=2,sku_num,0)) Monday,
sum(if(dayofweek(create_date)=3,sku_num,0)) Tuesday,
sum(if(dayofweek(create_date)=4,sku_num,0)) Wednesday,
sum(if(dayofweek(create_date)=5,sku_num,0)) Thursday,
sum(if(dayofweek(create_date)=6,sku_num,0)) Friday,
sum(if(dayofweek(create_date)=7,sku_num,0)) Saturday,
sum(if(dayofweek(create_date)=1,sku_num,0)) Sunday
from
order_detail
where
create_date>='2021-09-27' and create_date<='2021-10-03'
group by
sku_id
- dayofweek() 獲取一個日期是星期幾的方法
2.33 查看每件商品的售價漲幅情況
2.33.1 題目需求
從商品價格變更明細表(sku_price_modify_detail),得到最近一次價格的漲幅情況,並按照漲幅升序排序。
結果如下:
Sku_id(商品id) | Price_change(漲幅) |
---|---|
8 | -200.00 |
9 | -100.00 |
2 | -70.00 |
11 | -16.00 |
12 | -15.00 |
3 | 1.00 |
5 | 10.00 |
10 | 10.00 |
7 | 12.00 |
6 | 12.00 |
1 | 100.00 |
4 | 400.00 |
2.33.2 代碼實現
-- 對每個商品按照修改日期倒序排序 並求出差值
select
t1.sku_id,
t1.price_change
from
(
-- 最近一次修改的價格
select
sku_id,
new_price-lead(new_price,1,0)over(partition by sku_id order by change_date desc) price_change,
rank()over(partition by sku_id order by change_date desc) rk
from
sku_price_modify_detail
)t1
where
rk=1
order by
t1.price_change
2.34 銷售訂單首購和次購分析
2.34.1 題目需求
通過商品信息表(sku_info)訂單信息表(order_info)訂單明細表(order_detail)分析如果有一個用戶成功下單兩個及兩個以上的購買成功的手機訂單(購買商品爲xiaomi 10,apple 12,小米13)那麼輸出這個用戶的id及第一次成功購買手機的日期和第二次成功購買手機的日期,以及購買手機成功的次數。
結果如下:
User_id(用戶id) | First_date(首次時間) | Last_value(末次時間) | Cn(購買次數) |
---|---|---|---|
101 | 2021-09-27 | 2021-09-28 | 3 |
1010 | 2021-10-08 | 2021-10-08 | 2 |
102 | 2021-10-01 | 2021-10-01 | 3 |
103 | 2021-09-30 | 2021-10-02 | 2 |
104 | 2021-10-03 | 2021-10-03 | 3 |
105 | 2021-10-04 | 2021-10-04 | 2 |
106 | 2021-10-04 | 2021-10-05 | 3 |
107 | 2021-10-05 | 2021-10-05 | 3 |
108 | 2021-10-06 | 2021-10-06 | 3 |
109 | 2021-10-07 | 2021-10-07 | 3 |
2.34.2 代碼實現
select
distinct oi.user_id,
first_value(od.create_date)over(partition by oi.user_id order by od.create_date rows between unbounded preceding and unbounded following ) first_date,
last_value(od.create_date)over(partition by oi.user_id order by od.create_date rows between unbounded preceding and unbounded following ) last_date,
count(*)over(partition by oi.user_id order by od.create_date rows between unbounded preceding and unbounded following) cn
from
order_info oi
join
order_detail od
on
oi.order_id=od.order_id
join
sku_info si
on
od.sku_id=si.sku_id
where
si.name in('xiaomi 10','apple 12','xiaomi 13')
2.35 同期商品售賣分析表
2.35.1 題目需求
從訂單明細表(order_detail)中。
求出同一個商品在2021年和2022年中同一個月的售賣情況對比。
結果如下(截取部分):
Sku_id(商品id) | Month(月份) | 2020_skusum(2020銷售量) | 2021_skusum(2021銷售量) |
---|---|---|---|
1 | 9 | 0 | 11 |
1 | 10 | 2 | 38 |
1 | 10 | 94 | 205 |
11 | 10 | 95 | 225 |
12 | 9 | 0 | 43 |
12 | 10 | 83 | 20556 |
2 | 10 | 26 | 6018 |
3 | 9 | 0 | 5 |
3 | 10 | 1 | 30 |
4 | 9 | 0 | 9 |
2.35.2 代碼實現
select
if(t1.sku_id is null,t2.sku_id,t1.sku_id) as sku_id,
month(if(t1.ym is null,t2.ym,t1.ym)) as month,
if(t1.sku_sum is null ,0 ,t1.sku_sum) as 2020_skusum,
if(t2.sku_sum is null ,0 ,t2.sku_sum) as 2021_skusum
from
(
-- 2020年銷售量
select
sku_id,
concat(date_format(create_date,'yyyy-MM'),'-01') ym,
sum(sku_num) sku_sum
from
order_detail
where
year(create_date)=2020
group by
sku_id,date_format(create_date,'yyyy-MM')
)t1
full join
(
-- 2021年銷售量
select
sku_id,
concat(date_format(create_date,'yyyy-MM'),'-01') ym,
sum(sku_num) sku_sum
from
order_detail
where
year(create_date)=2021
group by
sku_id,date_format(create_date,'yyyy-MM')
)t2
on
t1.sku_id=t2.sku_id and month(t1.ym) = month(t2.ym)
2.36 國慶期間每個品類的商品的收藏量和購買量
2.36.1 題目需求
從訂單明細表(order_detail)和收藏信息表(favor_info)統計2021國慶期間,每個商品總收藏量和購買量
結果如下:
Sku_id | Sku_sum(購買量) | Favor_cn(收藏量) |
---|---|---|
1 | 38 | 1 |
10 | 205 | 2 |
11 | 225 | 2 |
12 | 20556 | 0 |
2 | 6018 | 1 |
3 | 30 | 0 |
4 | 44 | 2 |
5 | 209 | 1 |
6 | 26 | 1 |
7 | 180 | 1 |
8 | 148 | 0 |
9 | 182 | 1 |
2.36.2 代碼實現
select
t1.sku_id,
t1.sku_sum,
t2.favor_cn
from
(
-- 國慶期間 每個商品的總購買量
select
sku_id,
sum(sku_num) sku_sum
from
order_detail
where
create_date>='2021-10-01' and create_date<='2021-10-07'
group by
sku_id
)t1
join
(
-- 國慶期間 每個商品的總收藏量
select
sku_id,
count(*) favor_cn
from
favor_info
where
create_date>='2021-10-01' and create_date<='2021-10-07'
group by
sku_id
)t2
on
t1.sku_id=t2.sku_id
2.37 統計活躍間隔對用戶分級結果
2.37.1 題目需求
用戶等級:
忠實用戶:近7天活躍且非新用戶
新晉用戶:近7天新增
沉睡用戶:近7天未活躍但是在7天前活躍
流失用戶:近30天未活躍但是在30天前活躍
假設今天是數據中所有日期的最大值,從用戶登錄明細表中的用戶登錄時間給各用戶分級,求出各等級用戶的人數
結果如下:
Level(用戶等級) | Cn(用戶數量) |
---|---|
忠實用戶 | 6 |
新增用戶 | 3 |
沉睡用戶 | 1 |
2.37.2 代碼實現
select
t2.level,
count(*) as cn
from
(
select
uld.user_id,
case
when (date_format(max(uld.login_ts),'yyyy-MM-dd') <=date_sub(today, 30))
then '流失用戶'-- 最近登錄時間三十天前
when (date_format(min(uld.login_ts),'yyyy-MM-dd') <=date_sub(today, 7) and date_format(max(uld.login_ts),'yyyy-MM-dd') >=date_sub(today, 7))
then '忠實用戶' -- 最早登陸時間是七天前,並且最近七天登錄過
when (date_format(min(uld.login_ts),'yyyy-MM-dd') >=date_sub(today, 7))
then '新增用戶' -- 最早登錄時間是七天內
when (date_format(min(uld.login_ts),'yyyy-MM-dd') <= date_sub(today, 7) and date_format(max(uld.login_ts),'yyyy-MM-dd') <= date_sub(today, 7))
then '沉睡用戶'-- 最早登陸時間是七天前,最大登錄時間也是七天前
end level
from
user_login_detail uld
join
(
select
date_format(max(login_ts),'yyyy-MM-dd') today
from
user_login_detail
)t1
on
1=1
group by
uld.user_id,t1.today
)t2
group by
t2.level
2.38 連續簽到領金幣數
2.38.1 題目需求
用戶每天簽到可以領1金幣,並可以累計簽到天數,連續簽到的第3、7天分別可以額外領2和6金幣。
每連續簽到7天重新累積簽到天數。
從用戶登錄明細表中求出每個用戶金幣總數,並按照金幣總數倒序排序
結果如下:
User_id(用戶id) | Sum_coin_cn(金幣總數) |
---|---|
101 | 7 |
109 | 3 |
107 | 3 |
102 | 3 |
106 | 2 |
104 | 2 |
103 | 2 |
1010 | 2 |
108 | 1 |
105 | 1 |
2.38.2 代碼實現
兩個難點:
-
如何確定簽到日期是否連續?
-
如何確定每一次簽到獲取金幣的數量?
-- 求出每個用戶的金幣總數
select
t3.user_id,
sum(t3.coin_cn) sum_coin_cn
from
(
-- 求出金幣數量,以及簽到獎勵的金幣數量
select
t2.user_id,
max(t2.counti_cn)+sum(if(t2.counti_cn%3=0,2,0))+sum(if(t2.counti_cn%7=0,6,0)) coin_cn
from
(
-- 求連續並標誌是連續的第幾天
select
t1.user_id,
t1.login_date,
date_sub(t1.login_date,t1.rk) login_date_rk,
count(*)over(partition by t1.user_id, date_sub(t1.login_date,t1.rk) order by t1.login_date) counti_cn
from
(
-- 窗口排序
select
user_id,
date_format(login_ts,'yyyy-MM-dd') login_date,
rank()over(partition by user_id order by date_format(login_ts,'yyyy-MM-dd')) rk
from
user_login_detail
group by
user_id,date_format(login_ts,'yyyy-MM-dd')
)t1
)t2
group by
t2.user_id,t2.login_date_rk
)t3
group by
t3.user_id
order by
sum_coin_cn desc
2.39 國慶期間的7日動銷率和滯銷率
2.39.1 題目需求
動銷率定義爲品類商品中一段時間內有銷量的商品佔當前已上架總商品數的比例(有銷量的商品/已上架總商品數)。
滯銷率定義爲品類商品中一段時間內沒有銷量的商品佔當前已上架總商品數的比例。(沒有銷量的商品 / 已上架總商品數)。
只要當天任一店鋪有任何商品的銷量就輸出該天的結果
從訂單明細表(order_detail)和商品信息表(sku_info)表中求出國慶7天每天每個品類的商品的動銷率和滯銷率
結果如下(截取部分):
Category_id(品類id) | 1號(動銷) | 1號(滯銷) | 2號(動銷) | 2號(滯銷) | 3號(動銷) | 3號(滯銷) |
---|---|---|---|---|---|---|
1 | 1.0 | 0.0 | 0.5 | 0.5 | 0.75 | 0.25 |
2 | 0.75 | 0.25 | 0.75 | 0.25 | 0.75 | 0.25 |
3 | 0.25 | 0.75 | 0.75 | 0.25 | 0.75 | 0.25 |
2.39.2 代碼實現
-- 每一天的動銷率 和 滯銷率
select
t2.category_id,
cast(t2.`第1天`/t3.cn as decimal(10,2)) as first_sale_rate,
cast(1-t2.`第1天`/t3.cn as decimal(10,2)) as first_unsale_rate,
cast(t2.`第2天`/t3.cn as decimal(10,2)) as second_sale_rate,
cast(1-t2.`第2天`/t3.cn as decimal(10,2))as second_unsale_rate,
cast(t2.`第3天`/t3.cn as decimal(10,2))as third_sale_rate,
cast(1-t2.`第3天`/t3.cn as decimal(10,2))as third_unsale_rate,
cast(t2.`第4天`/t3.cn as decimal(10,2))as fourth_sale_rate,
cast(1-t2.`第4天`/t3.cn as decimal(10,2))as fourth_unsale_rate,
cast(t2.`第5天`/t3.cn as decimal(10,2))as fifth_sale_rate,
cast(1-t2.`第5天`/t3.cn as decimal(10,2))as fifth_unsale_rate,
cast(t2.`第6天`/t3.cn as decimal(10,2))as sixth_sale_rate,
cast(1-t2.`第6天`/t3.cn as decimal(10,2))as sixth_unsale_rate,
cast(t2.`第7天`/t3.cn as decimal(10,2))as seventh_sale_rate,
cast(1-t2.`第7天`/t3.cn as decimal(10,2))as seventh_unsale_rate
from
(
-- 國慶每一天 每個商品品類有多少商品被銷售了
select
t1.category_id,
sum(if(t1.create_date='2021-10-01',1,0)) `第1天`,
sum(if(t1.create_date='2021-10-02',1,0)) `第2天`,
sum(if(t1.create_date='2021-10-03',1,0)) `第3天`,
sum(if(t1.create_date='2021-10-04',1,0)) `第4天`,
sum(if(t1.create_date='2021-10-05',1,0)) `第5天`,
sum(if(t1.create_date='2021-10-06',1,0)) `第6天`,
sum(if(t1.create_date='2021-10-07',1,0)) `第7天`
from
(
select
distinct
si.category_id,
od.create_date,
si.name
from
order_detail od
join
sku_info si
on
od.sku_id=si.sku_id
where
od.create_date>='2021-10-01' and od.create_date<='2021-10-07'
)t1
group by
t1.category_id
)t2
join
(
select
category_id,
count(*) cn
from
sku_info
group by
category_id
)t3
on
t2.category_id=t3.category_id
2.40 同時在線最多的人數
2.40.1 題目需求
根據用戶登錄明細表(user_login_detail),求出平臺同時在線最多的人數。
結果如下:
Cn(人數) |
---|
7 |
2.40.2 代碼實現
-
時間有交集,纔會出現某一是個用戶同時在線
-
登入+1
-
登出-1
-- 拿到最大值 就是同時在線最多人數
select
max(sum_l_time) as cn
from
(
-- 按照時間求和
select
-- 窗口內求和
sum(flag)over(order by t1.l_time rows between unbounded preceding and current row) sum_l_time
from
(
-- 登錄標記1 登出標記-1
select
login_ts l_time,
1 flag
from
user_login_detail
union
select
logout_ts l_time,
-1 flag
from
user_login_detail
)t1
)t2
當order by後面的rows/range缺失時,默認是range between unbounded preceding and current row
rows是基於行數,range是基於值的大小
- range是邏輯窗口,是指定當前行對應值的範圍取值,列數不固定,只要行值在範圍內,對應列都包含在內
- rows是物理窗口,即根據order by 子句排序後,取的前N行及後N行的數據計算(與當前行的值無關,只與排序後的行號相關)
如 row 2 preceding,表示取前N行的數據計算,如 range 2 preceding,取前N行與當前行的差值不超過2的數據計算