clickhouse sql優化,減少查詢次數提升效率

業務場景:計算sales/count/sku的值、同比、環比
用條件聚合 SUM(if(...)) 在一條查詢中計算yoy同比和chain環比(注意:下面的示例並未實際使用開窗函數neighbor)
// 原本計算同比和環比需要使用三條sql聚合後相除
-- Current-period totals for category 50015380: months 2020-01-01 through
-- 2020-05-01, reported as a single row labelled with the start of that year.
SELECT
    toStartOfYear(toDate('2020-05-01')) AS date,
    ROUND(SUM(sales), 2)                AS sales,
    SUM(count)                          AS count,
    COUNT(DISTINCT product_id)          AS sku   -- distinct products sold
FROM mars_rc_roma_wide
WHERE month BETWEEN '2020-01-01' AND '2020-05-01'
  AND category_id IN (50015380)
GROUP BY date


-- Year-over-year baseline: the same metrics as the current-period query, for
-- the months 2019-01-01 through 2019-05-01.
-- The table and category are aligned with the current-period query
-- (mars_rc_roma_wide / 50015380); a year-over-year ratio is only meaningful
-- when both sides are computed over the same table and category.
SELECT
    toStartOfYear(toDate('2019-05-01')) AS date,
    ROUND(SUM(sales), 2)                AS sales,
    SUM(count)                          AS count,
    COUNT(DISTINCT product_id)          AS sku   -- distinct products sold
FROM mars_rc_roma_wide
WHERE (category_id IN (50015380))
    AND month >= '2019-01-01'
    AND month <= '2019-05-01'
GROUP BY date

-- Second comparison baseline for category 50015380: months 2019-01-01 through
-- 2019-03-01, reported as a single row labelled with the start of that year.
-- NOTE(review): this window is neither the same length as nor adjacent to the
-- 2020-01..2020-05 current period -- confirm it is the intended baseline.
SELECT
    toStartOfYear(toDate('2019-03-01')) AS date,
    ROUND(SUM(sales), 2)                AS sales,
    SUM(count)                          AS count,
    COUNT(DISTINCT product_id)          AS sku   -- distinct products sold
FROM mars_rc_roma_wide
WHERE month BETWEEN '2019-01-01' AND '2019-03-01'
  AND category_id IN (50015380)
GROUP BY date


// 使用條件聚合 SUM(if(...)) 後將其合併成一條查詢(這裏用的並不是真正的開窗函數)
-- Single-pass replacement for the three separate period queries: every metric
-- is a conditional aggregate over one scan of mars_rc_roma_wide.
--   *_last  : current period, months 2020-01-01 .. 2020-05-01
--   *_yoy   : % change of the current period vs. months 2019-01-01 .. 2019-05-01
--   *_ratio : % change of the current period vs. months 2019-01-01 .. 2019-03-01
-- Rows outside a period contribute 0 to SUM and NULL to COUNT(DISTINCT ...).
-- NULL is skipped by the aggregate, whereas a 0 placeholder would be counted
-- as one extra distinct product.
-- NOTE(review): when a baseline aggregate is 0 the divisions below do not
-- yield NULL (presumably inf/nan for float results) -- wrap the denominators
-- in nullIf(..., 0) if callers need NULL instead; confirm desired behaviour.
SELECT
    ROUND(SUM(if(month >= '2020-01-01' AND month <= '2020-05-01', sales, 0)), 2) AS sales_last,
    SUM(if(month >= '2020-01-01' AND month <= '2020-05-01', count, 0)) AS count_last,
    COUNT(DISTINCT if(month >= '2020-01-01' AND month <= '2020-05-01', product_id, NULL)) AS sku_last,
    (sales_last / ROUND(SUM(if(month >= '2019-01-01' AND month <= '2019-05-01', sales, 0)), 2) - 1) * 100 AS sales_yoy,
    (count_last / SUM(if(month >= '2019-01-01' AND month <= '2019-05-01', count, 0)) - 1) * 100 AS count_yoy,
    (sku_last / COUNT(DISTINCT if(month >= '2019-01-01' AND month <= '2019-05-01', product_id, NULL)) - 1) * 100 AS sku_yoy,
    (sales_last / ROUND(SUM(if(month >= '2019-01-01' AND month <= '2019-03-01', sales, 0)), 2) - 1) * 100 AS sales_ratio,
    (count_last / SUM(if(month >= '2019-01-01' AND month <= '2019-03-01', count, 0)) - 1) * 100 AS count_ratio,
    (sku_last / COUNT(DISTINCT if(month >= '2019-01-01' AND month <= '2019-03-01', product_id, NULL)) - 1) * 100 AS sku_ratio
FROM mars_rc_roma_wide
WHERE (category_id IN (50015380))
    -- Skip months that none of the conditional aggregates above can match;
    -- the result is unchanged because such rows only ever contributed 0/NULL.
    AND month >= '2019-01-01'
    AND month <= '2020-05-01'


// 注意⚠️:在計算sales銷售額和count銷量時,使用的是SUM;計算sku時,使用的是count
// COUNT(DISTINCT(if(month >= '2019-01-01' AND month <= '2019-05-01', product_id, 0)))
// 這裏如果對精度要求很高的話,COUNT(DISTINCT if(clause, a, b))中的b不要寫0或者任何有意義的數
// 因爲不管b是標成0還是-999,它都會被當成一個不同的值計入;沒有任何行滿足條件時結果仍會顯示成1,這樣沒辦法區分業務值是1還是0。比較好的方法是寫成NULL,聚合時NULL會被忽略,此時得到的值是0

 

 
 
 
業務場景:clickhouse中array(T)中某個字符串類型的值,進行數值區間的計算
// 這裏有n個區間就會進行 n*2 次查詢
-- Sales and unit totals for the '100+' price bucket: products whose
-- `mars_rc_roma_unit_price_kg` attribute, parsed as a number and truncated to
-- an unsigned integer, is at least 100. Unparseable values become 0.
SELECT
    '100+'     AS attr_value,
    SUM(sales) AS sales,
    SUM(count) AS count
FROM
(
    -- One row per distinct attribute value within the month window.
    -- No ORDER BY here: the outer query re-aggregates these rows, so sorting
    -- them would not affect the result.
    SELECT
        toUInt64(toFloat64OrZero(a.attr_value)) AS value,
        ROUND(SUM(sales), 2)                    AS sales,
        SUM(count)                              AS count
    FROM mars_rc_roma_wide
    ARRAY JOIN attributes AS a
    WHERE (category_id IN (50015380))
        AND month >= '2019-01-01'
        AND month <= '2019-05-01'
        AND a.attr_name = 'mars_rc_roma_unit_price_kg'
        AND a.attr_value != ''
    GROUP BY `a.attr_name`, `a.attr_value`
)
WHERE (value >= 100)

// 把原本通過 multiprocessing 並行執行的多條查詢(每條對應不同的區間值)放到一條查詢裏完成
// 這裏原本有n個區間 [duration1, duration2],將區間前後通過其index進行命名取出
// 改進後將查詢縮減成一次,但需要注意的是,if()條件中用到的值(month)需要在子查詢中先查出來,以及必須先在子查詢進行聚合,才能使用這樣的優化
-- All price-bucket metrics in a single query (shown here for the first bucket,
-- 0 < value < 20):
--   s0       : bucket sales in the current period (2020-01-01 .. 2020-05-01)
--   s0_ratio : % change of s0 vs. the same bucket in 2019-01-01 .. 2019-05-01
--   s0_share : s0 as a percentage of current-period sales across all buckets
-- NOTE(review): a zero denominator does not produce NULL here -- wrap it in
-- nullIf(..., 0) if callers need NULL; confirm desired behaviour.
SELECT
    'mars_mwc_roma_unit_price_kg' AS attr_value,
    SUM(if((value > 0 AND value < 20) AND month >= '2020-01-01' AND month <= '2020-05-01', sales, 0)) AS s0,
    (s0 / SUM(if((value > 0 AND value < 20) AND month >= '2019-01-01' AND month <= '2019-05-01', sales, 0)) - 1) * 100 AS s0_ratio,
    (s0 / SUM(if(month >= '2020-01-01' AND month <= '2020-05-01', sales, 0))) * 100 AS s0_share
FROM
(
    -- Pre-aggregate per attribute value and month; `month` must be projected
    -- so the outer conditional aggregates can test it.
    -- No ORDER BY here: the outer query re-aggregates these rows, so sorting
    -- them would not affect the result.
    SELECT
        toUInt64(toFloat64OrZero(a.attr_value)) AS value,
        SUM(sales)                              AS sales,
        month
    FROM mars_mwc_roma_wide
    ARRAY JOIN attributes AS a
    WHERE (category_id IN (1))
        AND (price >= 0)
        AND (size >= 0)
        AND a.attr_name = 'mars_mwc_roma_unit_price_kg'
        AND a.attr_value != ''
        -- Only the two windows referenced by the outer query are needed;
        -- every other month would contribute 0 to each outer SUM.
        AND ((month >= '2019-01-01' AND month <= '2019-05-01')
          OR (month >= '2020-01-01' AND month <= '2020-05-01'))
    GROUP BY `a.attr_name`, `a.attr_value`, month
)

 

 
 
 
業務場景:需要計算各個詞的 tgi 的值,主要是爲了觀察熱詞在該品牌的熱度,並與其在該品類的熱度進行比較。
解釋TGI:即Target Group Index(目標羣體指數) 
TGI指數= [目標羣體中具有某一特徵的羣體所佔比例/總體中具有相同特徵的羣體所佔比例]*標準數100。
例如,在15-24歲的人羣中,有8.9%的人過去一年內去過某電影網站看電影,而在總體人羣中,去過該電影網站看電影的人數比例爲6.6%,則該電影網站在15-24歲人羣中的TGI指數是134.9(8.9%/6.6%×100),這說明,該電影網站主要定位在15-24歲的人羣中。其數額越大,就表明目標羣體吻合度就越高。
TGI指數表徵不同特徵用戶關注問題的差異情況,其中TGI指數等於100表示平均水平,高於100,代表該類用戶對某類問題的關注程度高於整體水平。

// 查看整體的 sales_in_category_brand 和 sales_in_category
// 1:sql_sales_in_category_brand
-- Total sales of one brand within the listed categories for the months
-- 2020-01-01 through 2020-05-01 (the brand-side figure of the TGI ratio).
-- NOTE(review): the backslashes inside the brand literal look like leftover
-- escaping; confirm they are interpreted as intended in an equality comparison.
SELECT ROUND(SUM(sales), 2) AS sales
FROM tmall_wide_idx2
WHERE (category_id IN (50003695, 350407, 350402, 50008555, 50019790, 50012144, 50019649, 50008652, 120611, 121704, 50011867, 50008109, 50016107, 50008739, 50005266, 50024944, 1205, 50005050, 50005174, 50228001, 126412033, 125538002, 350203, 50011883, 1101))
    AND LOWER(`brand_name`) = LOWER('HUAWEI\/\華\爲')
    AND (price >= 0)
    AND (size >= 0)
    AND `month` >= '2020-01-01'
    AND `month` <= '2020-05-01'

// 2:sql_sales_in_category
-- Total sales across all brands within the listed categories for the months
-- 2020-01-01 through 2020-05-01 (the category-side figure of the TGI ratio).
SELECT
    ROUND(SUM(sales), 2) AS sales
FROM tmall_wide_idx2
WHERE `month` BETWEEN '2020-01-01' AND '2020-05-01'
  AND (price >= 0)
  AND (size >= 0)
  AND category_id IN (
        50003695, 350407, 350402, 50008555, 50019790,
        50012144, 50019649, 50008652, 120611, 121704,
        50011867, 50008109, 50016107, 50008739, 50005266,
        50024944, 1205, 50005050, 50005174, 50228001,
        126412033, 125538002, 350203, 50011883, 1101
      )


// 上面那兩種寫法是爲了便於理解,實質上對應的是這樣一條sql,直接查出兩個值
-- Category-wide sales and one brand's sales from a single scan: the brand
-- figure is a conditional sum over the same filtered rows.
-- NOTE(review): the backslashes inside the brand literal look like leftover
-- escaping; confirm they are interpreted as intended in an equality comparison.
SELECT
    ROUND(SUM(sales), 2) AS category_sales,
    ROUND(SUM(if(LOWER(brand_name) = LOWER('HUAWEI\/\華\爲'), sales, 0)), 2) AS brand_sales
FROM tmall_wide_idx2
WHERE (category_id IN (50003695, 350407, 350402, 50008555, 50019790, 50012144, 50019649, 50008652, 120611, 121704, 50011867, 50008109, 50016107, 50008739, 50005266, 50024944, 1205, 50005050, 50005174, 50228001, 126412033, 125538002, 350203, 50011883, 1101))
    AND (price >= 0)
    AND (size >= 0)
    AND `month` >= '2020-01-01'
    AND `month` <= '2020-05-01'


// 優化的重點在於對應的品牌關鍵詞的不同。
// (這些詞來源於我們對商品標題的切詞,同樣放在Array(T)中,需要時可直接查出)
// 原本的寫法是帶品牌查一次,不帶品牌查一次;將兩次的結果join;若需要觀察40個詞,相當於進行了80個查詢

-- Sales of one brand's products whose title contains a given keyword, within
-- the listed categories for the months 2020-01-01 through 2020-05-01.
-- NOTE(review): the backslashes inside the brand literal look like leftover
-- escaping; confirm they are interpreted as intended in an equality comparison.
SELECT
    ROUND(SUM(sales), 2) AS sales
FROM tmall_wide_idx2
WHERE `month` BETWEEN '2020-01-01' AND '2020-05-01'
  AND (price >= 0)
  AND (size >= 0)
  AND LOWER(`brand_name`) = LOWER('HUAWEI\/\華\爲')
  AND LOWER(title) LIKE LOWER('%%\集\顯%%')
  AND category_id IN (
        50003695, 350407, 350402, 50008555, 50019790,
        50012144, 50019649, 50008652, 120611, 121704,
        50011867, 50008109, 50016107, 50008739, 50005266,
        50024944, 1205, 50005050, 50005174, 50228001,
        126412033, 125538002, 350203, 50011883, 1101
      )

-- Sales of all products, regardless of brand, whose title contains a given
-- keyword, within the listed categories for 2020-01-01 through 2020-05-01.
SELECT ROUND(SUM(sales), 2) AS sales
FROM tmall_wide_idx2
WHERE (category_id IN (50003695, 350407, 350402, 50008555, 50019790, 50012144, 50019649, 50008652, 120611, 121704, 50011867, 50008109, 50016107, 50008739, 50005266, 50024944, 1205, 50005050, 50005174, 50228001, 126412033, 125538002, 350203, 50011883, 1101))
    AND (price >= 0)
    AND (size >= 0)
    AND LOWER(title) LIKE LOWER('%%\集\顯%%')
    AND `month` >= '2020-01-01'
    AND `month` <= '2020-05-01'

// 優化寫法是合併,不管查多少個詞,就都對應一個查詢;
// 還有個注意點是需要在條件中加上 multiSearchAny,減小查詢範圍
-- Per-keyword sales for any number of keywords in one scan. Each keyword gets
-- a pair of conditional sums:
--   salesN_brands    : rows of the brand whose title matches keyword N
--   salesN_no_brands : rows of any brand whose title matches keyword N
-- The multiSearchAny predicate keeps only rows whose title contains at least
-- one of the keywords, so the conditional sums run over a smaller row set.
-- NOTE(review): the backslashes inside the brand literal look like leftover
-- escaping; confirm they are interpreted as intended in an equality comparison.
SELECT
    -- keyword 1
    ROUND(SUM(if(LOWER(brand_name) = LOWER('HUAWEI\/\華\爲')
                 AND LOWER(title) LIKE LOWER('%%\集\顯%%'), sales, 0)), 2) AS sales1_brands,
    ROUND(SUM(if(LOWER(title) LIKE LOWER('%%\集\顯%%'), sales, 0)), 2)     AS sales1_no_brands,
    -- keyword 2
    ROUND(SUM(if(LOWER(brand_name) = LOWER('HUAWEI\/\華\爲')
                 AND LOWER(title) LIKE LOWER('%%\聲\紋%%'), sales, 0)), 2) AS sales2_brands,
    ROUND(SUM(if(LOWER(title) LIKE LOWER('%%\聲\紋%%'), sales, 0)), 2)     AS sales2_no_brands,
    -- keyword 3
    ROUND(SUM(if(LOWER(brand_name) = LOWER('HUAWEI\/\華\爲')
                 AND LOWER(title) LIKE LOWER('%%\攝\像\頭%%'), sales, 0)), 2) AS sales3_brands,
    ROUND(SUM(if(LOWER(title) LIKE LOWER('%%\攝\像\頭%%'), sales, 0)), 2)     AS sales3_no_brands
FROM tmall_wide_idx2
WHERE `month` BETWEEN '2020-01-01' AND '2020-05-01'
  AND (price >= 0)
  AND (size >= 0)
  AND (multiSearchAny(LOWER(title), ['集顯', '聲紋', '攝像頭']))
  AND category_id IN (
        50003695, 350407, 350402, 50008555, 50019790,
        50012144, 50019649, 50008652, 120611, 121704,
        50011867, 50008109, 50016107, 50008739, 50005266,
        50024944, 1205, 50005050, 50005174, 50228001,
        126412033, 125538002, 350203, 50011883, 1101
      )


 

 
總結
其他的優化思路都大同小異
主要就是使用clickhouse內置方法,減少查詢次數,提高查詢效率
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章