묻고 답해요
141만명의 커뮤니티!! 함께 토론해봐요.
인프런 TOP Writers
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
윈도우 함수 - 탐색함수 연습 문제1번-- 1번. user들의 다음 접속 월과 다다음 접속 월을 구해라. SELECT user_id, visit_month, LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS visit_after_one_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS visit_after_two_month FROM advanced.analytics_function_01 ORDER BY user_id; -- LEAD 함수는 항상 ORDER BY가 필요함!=> LEAD는 순서를 알아야 동작하는 함수이기 때문에~!!2번-- 2번. user들의 다음 접속 월과 다다음 접속 월, 이전 접속 월을 구해라. SELECT user_id, visit_month, LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS visit_after_one_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS visit_after_two_month, LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS visit_before_one_month FROM advanced.analytics_function_01 ORDER BY user_id;3번-- 3번. user가 접속했을 때, 다음 접속까지의 간격을 구해라. -- LAG으로 이전 월을 구한 뒤, 현재 행과 뺄셈 진행하기.. (방문 텀을 구하는 것이므로.. 현재 달에 이전 월을 빼는게 더 맞는 방향이라 생각함....!) WITH base AS( SELECT *, LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS before_visit_month FROM advanced.analytics_function_01 ) SELECT *, (visit_month - before_visit_month) AS visit_interval FROM base; -- 피연산자에 null값이 포함되어 있으면 해당 연산의 결과는 NULL이 된다. -- 3번을 서브쿼리 없이 짠다면.. -- SELECT -- *, -- LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS before_visit_month, -- visit_month - LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS gap -- FROM advanced.analytics_function_01 -- 하지만!! 여기서 만약 before_visit_month에 대한 내용을 수정한다고하면.. before_visit_month와 gap 두 부분을 수정해야한다... => 과연 괜찮은 쿼리일까? -- 쿼리가 길어지는 것을 무서워하지 말고, 쿼리를 덜 수정할 수 있는 구조를 만드는게 더 좋음!!!추가문제-- 추가문제 -- 이 데이터셋을 기준으로 user_id의 첫 번째 방문 월, 마지막 방문 월을 구해라. SELECT *, FIRST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS first_visit_month, LAST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS last_visit_month FROM advanced.analytics_function_01윈도우 함수 - FRAME 연습 문제1번-- 1)우리회사의모든주문량은? -- amount_total SELECT *, SUM(amount) OVER (ORDER BY order_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS amount_total FROM advanced.orders;2번-- 2)특정주문시점에서누적주문량은? -- cumulative_sum SELECT *, SUM(AMOUNT) OVER (ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum FROM advanced.orders;3번-- 3)고객별주문시점에서누적주문량은? -- cumulative_sum_by_uesr SELECT *, SUM(AMOUNT) OVER (PARTITION BY user_id ORDER BY order_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum_by_uesr FROM advanced.orders;4번-- 4)최근직전5개의평균주문량은? -- last_5_orders_avg_amount SELECT *, SUM(AMOUNT) OVER (ORDER BY order_date ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amount FROM advanced.orders; -- 정석 답 SELECT *, SUM(amount) OVER() AS amount_total, SUM(amount) OVER(ORDER BY order_id) AS cumulatvie_sum, SUM(amount) OVER(PARTITION BY user_id ORDER BY order_id) AS cumulatvie_sum_by_user, AVG(amount) OVER(ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amount FROM advanced.orders ORDER BY user_id, order_date; 윈도우 함수 연습 문제1번# 1. 사용자별 쿼리를 실행한 총 횟수를 구해라. SELECT *, COUNT(*) OVER (PARTITION BY user) AS total_query_cnt FROM advanced.query_logs;2번# 2. 주차별로 팀 내에게 쿼리를 가장 많이 사용한 사람들 보여주세요. -- week_number | team | user | query_cnt | team_rank -- week_number는 format_date("%U", query_date) 이용 -- 팀 내에서 쿼리를 가장 많이 사용한 사람.. -- 한 사람은 하나의 팀에만 속해있음 -- => Rank구할 때 PARTITION으로 팀으로 나누고 ORDER BY 에 query_cnt한 다음에.. 근데 이때 주차로도 나눠져야함.... PARTITION에 두 가지가 가능할까? => 애초에 주차별로 나눠서 count했어야 함..! -- 혼자 풀어본 쿼리 WITH base AS ( SELECT *, FORMAT_DATE("%U", query_date) AS week_number, COUNT(*) OVER (PARTITION BY user, FORMAT_DATE("%U", query_date)) AS query_cnt FROM advanced.query_logs ) SELECT DISTINCT week_number, team, user, query_cnt, RANK() OVER (PARTITION BY team ORDER BY query_cnt DESC) AS team_rank FROM base QUALIFY team_rank = 1 ORDER BY week_number, team; -- 문제 의도.. 에 철저하게 당했(?)다 -- window연습문제니까 window만 생각하게 됨...! -- 2번 정석 정답 -- 팀, 주차별 실행 횟수 count하기 -- 위의 결과를 바탕으로 랭킹 구한 뒤, QUALFY로 1등만 뽑기 -- WITH count_of_week AS ( -- SELECT -- EXTRACT(WEEK FROM query_date) AS week_number, -- team, -- user, -- COUNT(query_date) AS query_cnt -- FROM advanced.query_logs -- GROUP BY ALL -- ) -- SELECT -- *, -- RANK() OVER (PARTITION BY week_number, team ORDER BY query_cnt DESC) AS team_rank -- FROM count_of_week -- QUALIFY team_rank = 1 -- ORDER BY week_number, team3번# 3번. 2번 문제에서 사용한 주차별 쿼리를 사용하여 쿼리 실행시점 기준 1주 전 쿼리 실행 수를 별도의 컬럼을 확인할 수 있는 쿼리 작성하기 -- user | team | week_number | query_count | prev_week_query_count -- from count_of_week -- LAG(query_count) OVER (PARTITION BY user ORDER BY week_number) SELECT user, team, week_number, query_cnt, LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_qeury_cnt FROM count_of_week ORDER BY user, week_number;4번# 4번. 시간의 흐름에 따라 일자별로 유저가 실행한 누적 쿼리수를 작성해라. -- count -- frame설정 - ROWS UNBOUNDED PRECEDING AND CURRENT ROW -- 혼자 풀어본 리 SELECT *, COUNT(user) OVER (PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_cnt FROM advanced.query_logs ORDER BY user, query_date; -- 일자별 흐름. 일자별 누적합.. 먼저 "일자별"로 합을 구해뒀어야했음. -- 정석 쿼리 -- SELECT -- *, -- SUM(query_count) OVER (PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_cnt -- FROM ( -- SELECT -- *, -- COUNT(*) AS query_count -- FROM `advanced.query_logs` -- GROUP BY ALL) -- ORDER BY user, query_date 5번# 5번. 다음 데이터는 주문 횟수를 나타낸 데이터임. 만약 주문 횟수가 없으면 NULL이 기록된다. NULL로 되어있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해라. -- WITH raw_data AS ( -- SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL -- SELECT DATE '2024-05-02', 13 UNION ALL -- SELECT DATE '2024-05-03', NULL UNION ALL -- SELECT DATE '2024-05-04', 16 UNION ALL -- SELECT DATE '2024-05-05', NULL UNION ALL -- SELECT DATE '2024-05-06', 18 UNION ALL -- SELECT DATE '2024-05-07', 20 UNION ALL -- SELECT DATE '2024-05-08', NULL UNION ALL -- SELECT DATE '2024-05-09', 13 UNION ALL -- SELECT DATE '2024-05-10', 14 UNION ALL -- SELECT DATE '2024-05-11', NULL UNION ALL -- SELECT DATE '2024-05-12', NULL -- ), filled_null AS (SELECT -- date, -- IF(number_of_orders IS NULL, LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING), number_of_orders) AS number_of_orders -- FROM raw_data) -- frame을 위와 이 설정해주지 않아도 됐음!! -- 어짜피 null값을 제외한 마지막 값을 리턴하니까..!!!!!6번# 6번. 5번 문제에서 null을 채운 후, 2일 전~ 현재 데이터의 평균을 구하는 쿼리를 작성해라. SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_null7번# 7번. app_logs 테이블에서 custom session을 만들어라.이전 이벤트 로그와 20초가 지나면 새로운 session을 만들 것. -- event_date | event_timestamp | event_datetime | event_name | user_id | user_pesudo_id | before_event_datetime | second_diff | session_start | session_id -- event_datetime, before_event_datetime, second_diff, session_start, session_id 컬럼을 생성해야한다. -- timestamp_micros()이용하여 timestamp만들고 datetime으로 'Asia/Seoul' -- LAG로 before_datetime -- datetime_diff 함수로 secound_diff -- session_start가 1 아니면 null -- WITH base AS ( -- SELECT -- event_date, -- event_timestamp, -- DATETIME(timestamp_micros(event_timestamp), 'Asia/Seoul') as event_datetime, -- event_name, -- user_id, -- user_pseudo_id, -- DATETIME(timestamp_micros(LAG(event_timestamp) OVER (ORDER BY event_timestamp)), 'Asia/Seoul') AS before_event_datetime, -- from advanced.app_logs -- where event_date = '2022-08-18' -- AND user_pseudo_id = -- '1997494153.849199901' -- ) -- SELECT -- *, -- datetime_diff(event_datetime, before_event_datetime, second) AS second_diff, -- IF(datetime_diff(event_datetime, before_event_datetime, second) >= 20, 1, NULL) AS session_start -- from base -- ORDER BY event_timestamp -- 첫 번째 값의 NULL값 처리 못함 -- session_id 설정 못함.. -- 정석 쿼리 WITH base AS ( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = '2022-08-18' AND user_pseudo_id = '1997494153.8491999091' ), diff_data AS( SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff FROM ( SELECT *, LAG(event_datetime, 1) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime, FROM base ) ) SELECT *, SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_id FROM ( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 ELSE NULL END AS session_start FROM diff_data ) ORDER BY event_datetime 기억할 것윈도우 함수에 ORDER BY를 설정한다고 쿼리 결과값이 정렬되는 것은 아니다.(단지 함수를 위한 정렬일 뿐!) FROM 다음에 ORDER BY를 써줘야 쿼리 결과가 정렬되어 나타난다.frame 설정 없이도 기본값은 UNBOUNDED PRECEDING AND CURRENT ROW와 값이 같다. 하지만, 명시적 표현으로 해주는 것이 더 좋다LAG의 옵션으로 IGNORE NULLS는 없다. NULL을 구분하여 처리하고싶다면 FIRST_VALUE, LAST_VALUE 등의 함수를 이용해야 한다.세션을 구하려면 쿼리가 많아지는건 어쩔 수 없는 일! (쿼리가 많아진다고 겁 먹지 말고 일단 단계별로 진행해보자)하루 방문 횟수가 많다 하면 세션 기반이 좋을 수 있고 아니라면 그냥 일자별 유저 집계가 나을 수도 있다. (session 범위 기준은 데이터를 탐색하며 결정)time_diff 함수 사용할 때, 입력되는 데이터 타입에 맞춰 함수 이름을 써줘야 한다. (입력되는 데이터 타입이 timestamp라면 timestamp_diff 함수를 사용해야 하고, datetime이라면 datetime_diff 함수를 사용해야 한다.)'어떤 타이밍에 서브쿼리를 이용하는 것이 좋은 타이밍일까...' 를 고민했는데, 그냥 써야할 때 쓰면 되겠다고 생각했다. (코드가 길어지는 것을 두려워하지 말자~!)
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝 스터디 2주차 과제] 윈도우 함수 연습 문제
#1SELECT user_id, visit_month, LEAD(visit_month,1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_month, LEAD(visit_month,2) OVER(PARTITION BY user_id ORDER BY visit_month) AS the_month_after_next FROM `avdanced.analytics_function_01`#2SELECT user_id, visit_month, LEAD(visit_month,1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_month, LEAD(visit_month,2) OVER(PARTITION BY user_id ORDER BY visit_month) AS the_month_after_next, LAG(visit_month,1) OVER(PARTITION BY user_id ORDER BY visit_month) AS last_month FROM `avdanced.analytics_function_01` FrameSELECT SUM(amount) OVER() AS amount_total, #SUM(amount) OVER(partition by order_date) AS cumulative_sum, SUM(amount) OVER (ORDER BY order_date) AS cumulative_sum, #SUM(amount) OVER(partition by user_id) AS cumulative_sum_by_user, SUM(amount) OVER(partition by user_id ORDER BY order_id) AS cumulative_sum_by_user, AVG(amount) OVER(ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amount, AVG(amount) OVER(ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) AS last_5_unbounded_orders_avg_amount, AVG(amount) OVER(ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS all_orders_avg_amount FROM `avdanced.orders` 2)#1SELECT user, team, query_date, count(user) over(PARTITION BY user) AS total_query_cnt FROM advanced.query_logs ORDER BY query_date, user#2WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, RANK() OVER (PARTITION BY week_number, team ORDER BY query_cnt DESC) AS rk FROM query_cnt_by_team QUALIFY rk = 1 ORDER BY week_number, team, query_cnt DESC#3WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, LAG(query_cnt, 1) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM query_cnt_by_team ORDER BY user, week_number#4SELECT query_date, team, user, query_cnt, SUM(query_cnt) OVER (PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum FROM ( SELECT query_date, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) ORDER BY user, query_date#5WITH raw_data AS( SELECT DATE'2024-05-01'AS date,15 AS number_of_orders UNION ALL SELECT DATE'2024-05-02', 13 UNION ALL SELECT DATE'2024-05-03', NULL UNION ALL SELECT DATE'2024-05-04', 16 UNION ALL SELECT DATE'2024-05-05', NULL UNION ALL SELECT DATE'2024-05-06', 18 UNION ALL SELECT DATE'2024-05-07', 20 UNION ALL SELECT DATE'2024-05-08', NULL UNION ALL SELECT DATE'2024-05-09', 13 UNION ALL SELECT DATE'2024-05-10', 14 UNION ALL SELECT DATE'2024-05-11', NULL UNION ALL SELECT DATE'2024-05-12', NULL ) SELECT date, IF(number_of_orders is null , last_value(number_of_orders IGNORE NULLS) OVER(ORDER BY date asc), number_of_orders) AS number_of_orders_not_null FROM raw_data;#6WITH raw_data AS( SELECT DATE'2024-05-01'AS date,15 AS number_of_orders UNION ALL SELECT DATE'2024-05-02', 13 UNION ALL SELECT DATE'2024-05-03', NULL UNION ALL SELECT DATE'2024-05-04', 16 UNION ALL SELECT DATE'2024-05-05', NULL UNION ALL SELECT DATE'2024-05-06', 18 UNION ALL SELECT DATE'2024-05-07', 20 UNION ALL SELECT DATE'2024-05-08', NULL UNION ALL SELECT DATE'2024-05-09', 13 UNION ALL SELECT DATE'2024-05-10', 14 UNION ALL SELECT DATE'2024-05-11', NULL UNION ALL SELECT DATE'2024-05-12', NULL ), filled_data AS ( SELECT * EXCEPT(number_of_orders), LAST_VALUE(number_of_orders IGNORE NULLS) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_data#7WITH base AS( SELECT event_date, event_timestamp, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id, DATETIME(TIMESTAMP_MICROS(LAG(event_timestamp) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp)), 'Asia/Seoul') AS before_event_datetime FROM advanced.app_logs WHERE event_date = '2022-08-18' AND user_pseudo_id = '1997494153.8491999091' ), session_info AS( SELECT *, TIMESTAMP_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff, CASE WHEN TIMESTAMP_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 OR TIMESTAMP_DIFF(event_datetime, before_event_datetime, SECOND) IS NULL THEN 1 ELSE NULL END AS session_start FROM base ) SELECT *, SUM(session_start) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_id FROM session_info ORDER BY event_date, event_timestamp;
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습문제
-- 함수(컬럼) OVER (PARTITION BY 파티션 컬럼 ORDER BY 정렬할 컬럼) AS -- LEAD, LAG -> 함수(컬럼,순서). 순서를 명시하지 않으면 디폴트 1 -- 1) select *, LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead_visit_month, LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead2_visit_month FROM advanced.analytics_function_01 ORDER BY user_id ; -- 여기서 *를 쓰고 진행하는 이유가 있나? -- 2) select *, LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month, LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month, LAG(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS before_visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month ; -- 3) select *, LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month, LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) - visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month ; -- 서브쿼리 SELECT *, after_visit_month-visit_month AS diff FROM ( select *, LEAD(visit_month,1) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month FROM advanced.analytics_function_01 ) ORDER BY user_id, visit_month -- 쿼리가 길어지는 것을 무서워하지말고 쿼리를 덜 수정하는 방향으로 하자 -- # 추가문제 : 이 데이터셋 기준으로 user_id의 첫번째 방문월, 마지막방문월 구하는 쿼리 작성하기 first_value, last_value SELECT *, FIRST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS first_visit_month, LAST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS last_visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month ; -- 4) 시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요 SELECT *, SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_sum, SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum2 FROM ( SELECT query_date, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) ORDER BY user, query_date -- 5) WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07' , 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL UNION ALL ) SELECT *, LAST_VALUE(number_of_order IGNORE NULLS) OVER (ORDER BY date) AS last_value_orders FROM raw_data -- 6) WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07' , 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ), filled_data AS ( SELECT * EXCEPT(number_of_orders), **LAST_VALUE**(number_of_orders **IGNORE NULLS**) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_data -- 7) app_logs 테이블에서 Custom Session을 만들어 주세요. 이전 이벤트 로그와 20초가 지나면 새로운 Session을 만들어 주세요. Session은 숫자로 (1, 2, 3 …) 표시해도 됩니다 -- 2022-08-18일의 user_pseudo_id(1997494153.8491999091)은 session_id가 4까지 나옵니다 WITH base AS ( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id -- event_params가 필요하면 UNNEST => PIVOT해서 사용하면 됨 FROM advanced.app_logs WHERE event_date = "2022-08-18" AND user_pseudo_id = "1997494153.8491999091" -- WHERE 조건에 이벤트를 필터링해서 계산해도 괜찮음. 여기서는 필터링을 하지 않고, 진행 ), diff_data AS ( SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff # second_diff 기반으로 새로운 세션의 시작일지, 아닐지를 판단할 수 있음 FROM ( SELECT *, LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime # event_datetime이랑 prev_event_datetime을 빼서 20초가 넘으면 새로운 세션으로 정의. # 20초가 넘지 않으면 기존 세션 -- DATETIME_DIFF() => 차이를 구할 수 있음 FROM base ) ) SELECT *, # 누적합을 사용해서 session_number를 만들었다! SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num # session을 구할 때 쿼리가 길어질 수 있음. 하루에 접속을 여러번 하는 서비스 => session 기반이 좋을 수 있고, 아니라고 하면 일자별 유저 집계가 나을 수 있다 FROM ( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 30 THEN 1 # session을 나누는 기준 초. 데이터를 탐색하면서 결정. 보통 앱 로그에서는 30초, 60초 ELSE 0 END AS session_start # session이 시작됨을 알리는 session_start FROM diff_data ) ORDER BY event_datetime
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
1. 윈도우 함수 탐색 함수 연습 문제1-1) 유저의 다음 접속 월과 다다음 접속 월 구하기SELECT user_id , visit_month , LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS lead_visit_month , LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS lead2_visit_month FROM advanced.analytics_function_01 1-2) 이전 접속월 구하기SELECT user_id , visit_month , LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS before_visit_month FROM advanced.analytics_function_01 1-3) 다음 접속 월까지의 간격 구하기SELECT * , after_visit_month - visit_month AS diff_month FROM ( SELECT user_id , visit_month , LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month FROM advanced.analytics_function_01 ) 1-4) 첫번째와 마지막 방문 월 구하기SELECT user_id , visit_month , FIRST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS first_visit_month , LAST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS last_visit_month FROM advanced.analytics_function_01 2. 윈도우 함수 Frame 연습 문제SELECT * , SUM(amount) OVER () AS amount_total , SUM(amount) OVER (ORDER BY order_id) AS cumulative_sum , SUM(amount) OVER (PARTITION BY user_id ORDER BY order_id) AS user_cumulative_sum , AVG(amount) OVER (ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_avg FROM advanced.orders 3. 윈도우 함수 연습 문제1-1) 사용자 별 쿼리를 실행한 총 횟수 구하기SELECT user , team , query_date , COUNT(*) OVER(PARTITION BY user) AS total_query_cnt FROM advanced.query_logs ORDER BY user, query_date. -- 검증 1-2) 주차 별로 팀 내에서 쿼리를 많이 실행한 수, 단 랭킹이 1등인 경우만 구하기SELECT week_number , team , user , query_cnt , RANK() OVER(PARTITION BY week_number, team ORDER BY query_cnt DESC) AS team_rank FROM ( SELECT EXTRACT(WEEK FROM query_date) AS week_number , team , user , COUNT(query_date) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) QUALIFY team_rank=1 ORDER BY week_number 1-3) 쿼리 실행 시점 기준 1주 전 쿼리 실행 수 구하기WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number , team , user , COUNT(query_date) AS query_cnt FROM advanced.query_logs GROUP BY EXTRACT(WEEK FROM query_date) , team , user ) SELECT * , LAG(query_cnt) OVER(PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM query_cnt_by_team ORDER BY user 1-4) 시간의 흐름에 따라 일자 별 유저가 실행한 누적 쿼리 수 구하기WITH query_cnt_by_user AS ( SELECT user , team , query_date , COUNT(query_date) AS query_count FROM advanced.query_logs GROUP BY ALL ) SELECT user , team , query_date , query_count , SUM(query_count) OVER(PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_cnt -- , SUM(query_count) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_query_cnt2 FROM query_cnt_by_user -- 검증 -- QUALIFY cumulative_query_cnt != cumulative_query_cnt2 ORDER BY user, query_date*집계 분석 함수를 사용하고 ORDER BY가 있는 경우 Frame의 디폴트 값은 UNBOUNDED PRECEDING ~ CURRENT ROW 이다. 1-5) 값이 null인 경우 직전 값으로 채우기null 값이 2번 연속 있는 경우가 있어 COALESCE를 사용했습니다.WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date -- , number_of_orders , COALESCE( number_of_orders , LAG(number_of_orders, 1) OVER(ORDER BY date) , LAG(number_of_orders, 2) OVER(ORDER BY date) ) AS number_of_orders FROM raw_data 성윤님의 풀이를 보고 LAST_VALUE + IGNORE NULLS를 사용하면 연속되는 null의 수에 상관없이 직전 값을 채울 수 있어 깔끔하다는 생각이 들었습니다!SELECT * , LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date) AS last_value_orders FROM raw_data 1-6) 같은 데이터 셋에서 1일 전 부터 현재까지의 평균 구하기SELECT * , AVG(last_value_orders) OVER(ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM ( SELECT date , LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date) AS last_value_orders FROM raw_data ) 1-7) 20초를 기준으로 세션 아이디 구하기WITH base AS ( SELECT event_date , user_pseudo_id , event_name , DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime FROM advanced.app_logs WHERE user_pseudo_id='1997494153.8491999091' AND event_date="2022-08-18" ORDER BY event_timestamp ) , base_second_dff AS ( SELECT * , DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff FROM ( SELECT * , LAG(event_datetime) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) before_event_datetime FROM base ) ) , result AS( SELECT * -- CASE 문 사용 가능 , IF(second_diff - before_second_diff > 20 OR second_diff IS NULL, 1, null) AS session_start FROM ( SELECT * , LAG(second_diff) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS before_second_diff FROM base_second_dff ) ) SELECT * , SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_id FROM result ✔ 느낀 점프레임은 처음 보는 개념이라 초반에는 어색했지만 문제를 풀면서 익숙해질 수 있었다. 누적합을 구할 때 등 유용하게 사용할 것 같다."쿼리가 길어지는 것을 무서워하지말고 쿼리를 잘 수정할 수 있는 구조를 만들자" 일할 때에도 동료들이 수정하고 테스트하기 쉬운 구조인가?를 생각해보자.CTE의 이름을 짓는건 언제나 고민된다. 서브쿼리를 적절하게 사용하자! ✔ 새롭게 알게된 점ROW_NUMBER를 사용할 때 중복값이 있으면 순서가 보장되지 않을 수 있다. 이때는 OVER 안에서 id를 사용하자.FIRST_VALUE는 MIN과 같이 동작하지만, LAST_VALUE는 MAX가 아니다.전체 집계 값을 구하고 싶을 때에는 OVER 안에 아무것도 안 쓰면 된다.윈도우 함수에 조건을 걸고 싶을 땐 QUALIFY를 사용하자. 연속된 NULL값을 채우는 경우 LAST_VALUE와 IGNORE NULLS조합을 사용하자.
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
1. 윈도우 함수 연습문제 풀이문제 1)사용자별 쿼리를 실행한 총 횟수를 구하는 쿼리를 작성. 단, GROUP BY를 사용해서 집계하는 것이 아닌 query_logs의 데이터 우측에 새로운 컬럼을 만들어주세요SELECT user, team, query_date, COUNT(user) OVER (PARTITION BY user) AS total_query_cnt FROM advanced.query_logs ORDER BY query_date, team, user ;문제 2)주차별로 팀 내에서 많이 실행한 수를 구한 후, 실행한 수를 활용해 랭킹을 구해주세요. 단, 랭킹이 1등인 사람만 결과가 보이도록 해주세요.WITH weekly_team_query_cnt AS ( SELECT EXTRACT(week FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT week_number, team, user, query_cnt, ROW_NUMBER() OVER (PARTITION BY week_number, team ORDER BY query_cnt DESC) AS team_rank FROM weekly_team_query_cnt WHERE 1=1 QUALIFY team_rank = 1 ORDER BY week_number, team ;문제 3)(2번 문제에서 사용한 주차별 쿼리 사용) 쿼리를 실행한 시점 기준 1주 전에 쿼리 실행 수를 별도의 컬럼으로 확인할 수 있는 쿼리를 작성해주세요WITH weekly_team_query_cnt AS ( SELECT EXTRACT(week FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT user, team, week_number, query_cnt, LAG(query_cnt, 1) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM weekly_team_query_cnt ORDER BY user, team, week_number ;문제 4)시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성WITH timeseries_user_query_count AS ( SELECT user, team, query_date, COUNT(user) AS query_count FROM advanced.query_logs GROUP BY ALL ) SELECT user, team, query_date, query_count, SUM(query_count) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_query_count FROM timeseries_user_query_count문제 5)다음 데이터는 주문 횟수를 나타낸 데이터입니다. 만약 주문 횟수가 없으면 NULL로 기록됩니다. 이런 데이터에서 NULL 값이라고 되어있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해주세요.WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date, LAST_VALUE(number_of_orders IGNORE NULLS) OVER (ORDER BY date) AS number_of_orders FROM raw_data ;문제 6)5번 문제에서 NULL을 채운 후, 2일 전 ~ 현재 데이터의 평균을 구하는 쿼리를 작성해주세요(이동 평균)WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date, number_of_orders, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM ( SELECT * EXCEPT(number_of_orders), LAST_VALUE(number_of_orders IGNORE NULLS) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) ;문제 7)** 새로운 Session 부여하기본인 해결 쿼리) WITH create_new_session_app_logs_raw_data AS ( SELECT *, LAG(event_datetime) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS before_event_datetime FROM ( SELECT event_date, event_timestamp, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id, FROM advanced.app_logs WHERE event_date = '2022-08-18' AND user_pseudo_id = '1997494153.8491999091' ) ), create_new_session_app_logs AS ( SELECT *, IF(second_diff is null OR second_diff > 20, 1, null) AS session_start FROM ( SELECT *, EXTRACT(second FROM event_datetime - before_event_datetime) AS second_diff FROM create_new_session_app_logs_raw_data ) ) SELECT *, SUM(session_start) OVER (ORDER BY event_timestamp) AS session_id FROM create_new_session_app_logs ;카일스쿨님 해결 쿼리)본인 해결 쿼리와의 차이점 second_diff 새로운 컬럼 생성 시 DATETIME_DIFF(col1, col2, 시간단위)라는 함수 적용CASE WHEN문을 적용하여 직관적이고 수정하기 용이한 형태의 쿼리 사용 WITH base AS ( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id, FROM advanced.app_logs WHERE event_date = "2022-08-18" AND user_pseudo_id = "1997494153.8491999091" ORDER BY event_timestamp ), diff_data AS ( SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff # second_diff 기반으로 새로운 세션의 시작일지, 아닐지를 판단할 수 있음 FROM ( SELECT *, LAG(event_datetime, 1) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime, # event_datetime이랑 prev_event_datetime을 뺴서 20초가 넘으면 새로운 세션으로 정의 # 20초가 넘지 않으면 기존 세션 FROM base ) ) SELECT *, # 누적합을 사용해서 session_number를 만들었다! SUM(session_start) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num # session을 구할 때 쿼리가 길어질 수 있음. 하루에 접속을 여러번 하는 서비스 => session기반이 좋을 수 있고, 아니라고 한다면 일자별 집계가 나을 수 있음 FROM ( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 # session을 나누는 기준 초. 데이터를 탐색하면서 결정 ELSE NULL END AS session_start # session이 시작됨을 알리는 session_start FROM diff_data ) ORDER BY event_datetime ;
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
2. 윈도우 함수 주요 Point💡윈도우 함수 FRAME 개념은 익숙해지는 것이 중요qualify 개념 ⭐누적합 개념 사용해 세션 구하기 !! </aside> 2-1. 윈도우 함수 rows, range calculation연습문제 FRAME/* - 토픽: frame 연습문제 - 추가 컬럼 -- 1) amount_total -- 2) cumulative_sum -- 3) cumulative_sum_by_user -- 4) last_5_orders_avg_amount */ select * , sum(amount) over() as amount_total , sum(amount) over(order by order_id) as cumulative_sum , sum(amount) over(partition by user_id order by order_id) as cumulative_sum_by_user , sum(amount) over(order by order_id rows between 5 preceding and 1 preceding) as last_5_orders_avg_amount from advanced.orders ; 2-2. Qualify 개념윈도우 함수로 생성된 변수를 having 처리하는 것처럼 사용할 수 있음E.g. qualify amount_total ≥ 5002-3. 윈도우 함수 연습문제 모음윈도우 함수 연습문제/* - 연습문제 1 - window 활용 > partition by 활용 목적 */ select * , count(*) over(partition by user) as total_query_cnt from advanced.query_logs order by user, query_date ; /* - 연습문제 2 - window 활용 > partition by & qualify 활용 (훨씬 편하다!) - 해설 풀이 자체는 처음에 group by 쓰면 더 유용 (그럴 것 같음) */ with tmp as ( select week_number , team , user , count(*) over(partition by week_number, team, user) as query_cnt from ( select * , extract(week from query_date) as week_number from advanced.query_logs ) t ) select * , row_number() over(partition by week_number, team order by query_cnt desc) as team_rnk from tmp qualify team_rnk = 1 order by week_number, team ; /* - 연습문제 3 - window 활용 > lag 함수 목적 */ with tmp as ( select user , team , week_number , count(*) as query_count from ( select * , extract(week from query_date) as week_number from advanced.query_logs ) t group by user, team, week_number ) select * , lag(query_count) over(partition by user order by week_number) as prev_week_query_count from tmp t ; /* - 연습문제 4 - window 활용 > 누적 count 목적 - order by, partition by 개념 명확히 익히기에 좋음 - 해설 풀이와는 다름 (group by 후 > query_cnt를 sum하는 방식으로 풀이함) > 출제의도 frame 개념 파악 */ select distinct user , team , query_date , count(*) over(partition by user, query_date) as query_count ,count(*) over(partition by user order by query_date) as cumulative_query_count from advanced.query_logs order by 1, 2, 3 ; /* - 연습문제 6 - window 활용 > moving_avg 산출 문제 */ WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , coalesce(number_of_orders, lag(number_of_orders) over(order by date)) as number_of_orders1 , Last_value(number_of_orders ignore nulls) over(order by date) as number_of_orders2 FROM raw_data ; /* - 연습문제 5 - window 활용 > coalesce & lag 활용목적 */ WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) select * , avg(number_of_orders) over(order by date rows between 2 preceding and current row) as moving_avg from ( SELECT date -- , coalesce(number_of_orders, lag(number_of_orders) over(order by date)) as number_of_orders1 , Last_value(number_of_orders ignore nulls) over(order by date) as number_of_orders FROM raw_data ) ; 2-4. 세션 구하기 문제세션 구하기 연습문제/* - 연습문제 7 - session 구하기 - diff & 누적합 개념으로 세션 구하는 것이 포인트 (누적합 개념 신선!) */ with base as ( select event_date , datetime(timestamp_micros(event_timestamp), 'Asia/Seoul') as event_datetime , event_name , user_id , user_pseudo_id from advanced.app_logs ), diff_date as ( select * , datetime_diff(event_datetime, prev_event_datetime, second) as second_diff from ( select * , lag(event_datetime) over(partition by user_pseudo_id order by event_datetime) as prev_event_datetime from base ) ) select * , sum(Session_Start) over(partition by user_pseudo_id order by event_datetime) as session_num from ( select * , case when prev_event_Datetime is null then 1 when second_diff >= 20 then 1 else 0 end as session_start from diff_date ) order by event_Datetime, user_pseudo_id ;
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
집계 함수Aggregation FunctionGROUP BY와 함께 사용한 COUNT나 AVG는 컬럼의 특정 Value를 기준으로 집계하면서 하나의 값을 반환합니다. 윈도우 함수다음과 같은 경우 유용한 함수A유저의 전 주문 수량은 얼마나 될까?B유저가 물건을 구입하기 전에 상세보기 페이지를 얼마나 방문했을까?C유저의 특정 구매 시점별 누적 구매 횟수는?특정 카테고리 내에서 제일 많이 방문한 제품은?이런 경우 모두 윈도우 함수(분석 함수)를 사용하면 편리함윈도움 함수를 사용하지 않으면 서브 쿼리와 JOIN 등을 사용해서 만들어야 함 집계 함수와 다르게 윈도우 함수는 각행마다 단일 값을 반환함종류 탐색 함수 : LEAD, LAG, FIRST_VALUE, LAST_VALUE번호 지정 함수 : RANK, DENSE_RANK, PERCENT_RANK, CUME_DIST, NTITLE집계 분석 함수: 집계 함수들, AVG, COUNT, SUM, MAX, MIN 문법함수 이름(컬럼, OFFSET) OVER (PARTITION BY 파티션_컬럼 ORDER BY 정렬_컬럼)OFFSET : 값을 가져올 행의 위치, 기본 값은 1이고 생략 가능 함수 이름 (컬럼) OVER (PARTITON BY 파티션_컬럼 ORDER BY 정렬_컬럼)필요에 따라 PARTITION BY는 생략 가능 #문제1 SELECT user_id, visit_month, lead(visit_month, 1) over(partition by user_id order by visit_month asc) as next_visit, lead(visit_month, 2) over(partition by user_id order by visit_month asc) as next_next_visit FROM workspace.analytics_function_01 ORDER BY user_id, visit_month ; #문제2 SELECT user_id, visit_month, lead(visit_month, 1) over(partition by user_id order by visit_month asc) as next_visit, lead(visit_month, 2) over(partition by user_id order by visit_month asc) as next_next_visit, lag(visit_month, 1) over(partition by user_id order by visit_month asc) as prev_visit FROM workspace.analytics_function_01 ORDER BY user_id, visit_month ; #문제3 SELECT user_id, visit_month, lead(visit_month, 1) over(partition by user_id order by visit_month asc) as next_visit_month, lead(visit_month, 1) over(partition by user_id order by visit_month asc) - visit_month as next_visit_month_diff FROM workspace.analytics_function_01 ORDER BY user_id, visit_month ; #추가문제 SELECT DISTINCT user_id, first_value(visit_month) over(partition by user_id order by visit_month asc rows between unbounded preceding and unbounded following) as first_visit_month, last_value(visit_month) over(partition by user_id order by visit_month asc rows between unbounded preceding and unbounded following) as last_visit_month FROM workspace.analytics_function_01 ORDER BY user_id ; #문제4 SELECT *, sum(amount) over() as total_amount, sum(amount) over(order by order_id asc rows between unbounded preceding and current row) as cumulative_sum, sum(amount) over(partition by user_id order by order_id asc rows between unbounded preceding and current row) as cumulative_sum_by_user, avg(amount) over(order by order_id asc rows between 5 preceding and 1 preceding) as last_five_orders_avg_amount FROM workspace.orders ORDER BY order_id ; #연습문제1 SELECT *, count(*) over(partition by user) as query_count_by_users FROM workspace.query_logs ; #연습문제2 SELECT query_weeknum, team, user, query_count, rank() over(partition by query_weeknum, team order by query_count desc) as query_rank FROM ( SELECT extract(week from query_date) as query_weeknum, team, user, count(1) as query_count FROM workspace.query_logs GROUP BY ALL ) QUALIFY query_rank = 1 ORDER BY query_weeknum ; #연습문제3 SELECT team, user, query_weeknum, query_count, lag(query_count, 1) over(partition by team, user order by query_weeknum asc) as prev_week_query_count FROM ( SELECT team, user, extract(week from query_date) as query_weeknum, count(1) as query_count FROM workspace.query_logs GROUP BY ALL ) #연습문제4 SELECT team, user, query_date, query_count, sum(query_count) over(partition by team, user order by query_date asc rows between unbounded preceding and current row) as cumulative_sum FROM ( SELECT team, user, query_date, count(1) as query_count FROM workspace.query_logs GROUP BY ALL ) ORDER BY team, user, query_date ; #연습문제5 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date, ifnull(number_of_orders, real_prev_number_of_orders) as number_of_orders FROM ( SELECT date, number_of_orders, last_value(number_of_orders ignore nulls) over(order by date asc rows between unbounded preceding and 1 preceding) as real_prev_number_of_orders FROM raw_data ) ORDER BY date asc ; #연습문제6 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date, number_of_orders, avg(number_of_orders) over(order by date asc rows between 2 preceding and current row) as moving_avg FROM ( SELECT date, ifnull(number_of_orders, real_prev_number_of_orders) as number_of_orders FROM ( SELECT date, number_of_orders, last_value(number_of_orders ignore nulls) over(order by date asc rows between unbounded preceding and 1 preceding) as real_prev_number_of_orders FROM raw_data ) ) ORDER BY date asc ; #연습문제7 WITH total_logs AS ( SELECT user_pseudo_id, event_name, timestamp_micros(event_timestamp) as event_datetime FROM workspace.app_logs ) SELECT user_pseudo_id, event_name, event_datetime, prev_event_datetime, second_diff, sum(session_change) over(partition by user_pseudo_id order by event_datetime asc) as session_id FROM ( SELECT *, case when event_datetime = first_event_datetime then 1 end as session_id, case when second_diff is null or second_diff >= 20 then 1 else 0 end as session_change FROM ( SELECT *, datetime_diff(event_datetime, prev_event_datetime, second) as second_diff FROM ( SELECT user_pseudo_id, event_name, event_datetime, lag(event_datetime, 1) over(partition by user_pseudo_id order by event_datetime asc) as prev_event_datetime, first_value(event_datetime) over(partition by user_pseudo_id order by event_datetime asc) as first_event_datetime FROM total_logs ) ) ) ORDER BY user_pseudo_id, event_datetime ;
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
문제 1. SELECT *, lead(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_1month, lead(visit_month, 2) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_2month, FROM advanced.analytics_function_01 문제 2. SELECT *, lead(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_1month, lead(visit_month, 2) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_2month, lag(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS pre_1month FROM advanced.analytics_function_01 문제 3. SELECT *, SUM(amount) OVER (PARTITION BY user_id ORDER BY order_date, order_id ) AS cumulative_sum_by_user 문제 4. SELECT *, AVG(amount) OVER (ORDER BY order_date, order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amout 문제 1. SELECT *, COUNT(query_date) OVER (PARTITION BY user ORDER BY user) AS total_query_cnt FROM advanced.query_logs 문제 2. WITH table AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, RANK() OVER(PARTITION BY week_number ORDER BY query_cnt) AS team_rank FROM table QUALIFY team_rank = 1 문제 3. WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number, team, user, count(user) AS query_cnt FROM advanced.query_logs GROUP BY 1,2,3 ) SELECT *, lag(query_cnt) OVER(PARTITION BY team, user ORDER BY week_number asc) AS prev_week_query_cnt FROM base QUALIFY team_rank = 1 문제 4. WITH base AS ( SELECT user, team, query_date, COUNT(user) as query_count FROM advanced.query_logs GROUP BY 1,2,3 ) SELECT user, team, query_date, query_count, SUM(query_count) OVER(PARTITION BY team, user ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_count FROM base ORDER BY team, user, query_date; 문제 5. WITH raw_data AS( SELECT DATE'2024-05-01'AS date,15 AS number_of_orders UNION ALL SELECT DATE'2024-05-02',13 UNION ALL SELECT DATE'2024-05-03',NULL UNION ALL SELECT DATE'2024-05-04',16 UNION ALL SELECT DATE'2024-05-05',NULL UNION ALL SELECT DATE'2024-05-06',18 UNION ALL SELECT DATE'2024-05-07',20 UNION ALL SELECT DATE'2024-05-08',NULL UNION ALL SELECT DATE'2024-05-09',13 UNION ALL SELECT DATE'2024-05-10',14 UNION ALL SELECT DATE'2024-05-11',NULL UNION ALL SELECT DATE'2024-05-12',NULL ) SELECT date, IF(number_of_orders is null , last_value(number_of_orders IGNORE NULLS) OVER(ORDER BY date asc), number_of_orders) AS number_of_orders_not_null FROM raw_data; 문제 6. WITH raw_data AS( SELECT DATE'2024-05-01'AS date,15 AS number_of_orders UNION ALL SELECT DATE'2024-05-02', 13 UNION ALL SELECT DATE'2024-05-03', NULL UNION ALL SELECT DATE'2024-05-04', 16 UNION ALL SELECT DATE'2024-05-05', NULL UNION ALL SELECT DATE'2024-05-06', 18 UNION ALL SELECT DATE'2024-05-07', 20 UNION ALL SELECT DATE'2024-05-08', NULL UNION ALL SELECT DATE'2024-05-09', 13 UNION ALL SELECT DATE'2024-05-10', 14 UNION ALL SELECT DATE'2024-05-11', NULL UNION ALL SELECT DATE'2024-05-12', NULL ), filled_data AS ( SELECT * EXCEPT(number_of_orders), LAST_VALUE(number_of_orders IGNORE NULLS) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_data 문제 7. WITH base AS ( SELECT event_date, event_timestamp, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = '2022-08-18' ), diff_date AS ( SELECT *, DATETIME_DIFF(event_datetime, pre_event_time, second) AS date_diff_sec FROM ( SELECT *, LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime asc) AS pre_event_time FROM base ) ), session_start AS ( SELECT *, CASE WHEN pre_event_time IS NULL THEN 1 WHEN date_diff_sec >= 20 THEN 1 END AS start_session FROM diff_date ) SELECT event_date, event_datetime, event_name, user_id, user_pseudo_id, date_diff_sec, SUM(start_session) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_id FROM session_start ORDER BY user_pseudo_id, event_datetime;
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수, FRAME 설정, QUALIFY
2-4. 윈도우 함수 탐색 함수 연습 문제문제 1) User들의 다음 접속 월과 다다음 접속 월을 구하는 쿼리를 작성해주세요 select *, lead(visit_month) over (partition by user_id order by visit_month) as next_visit_month, lead(visit_month, 2) over (partition by user_id order by visit_month) as next_visit_month2 from advanced.analytics_function_01 ; 결과문제 2) User들의 다음 접속 월과 다다음 접속 월, 이전 접속 월을 구하는 쿼리 작성select *, lead(visit_month, 1) over (partition by user_id order by visit_month) as next_visit_month, lead(visit_month, 2) over (partition by user_id order by visit_month) as next_visit_month2, lag(visit_month, 1) over (partition by user_id order by visit_month) as pre_visit_month from advanced.analytics_function_01 order by user_id, visit_month결과문제 3) user가 접속했을 때, 다음접속까지의 간격을 구하시오 select *, next_visit_month - visit_month as term From ( select *, lead(visit_month, 1) over (partition by user_id order by visit_month) as next_visit_month from advanced.analytics_function_01) order by user_id, visit_month ;결과추가문제 : 이 데이터셋을 User_id의 첫번째 방문 월, 마지막 방문월을 구하는 쿼리 작성 select *, first_value(visit_month) over (partition by user_id order by visit_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as first_visit_month, last_value(visit_month) over (partition by user_id order by visit_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_visit_month from advanced.analytics_function_01 order by user_id, visit_month ; 결과2-8. 윈도우 함수 Frame연습 문제Frame 연습문제select *, sum(amount) over() as amount_total, sum(amount) over(order by order_id) as cumulative_sum, -- 누적 sum(amount) over(partition by user_id order by order_id) as cumulative_sum_user, -- 누적 avg(amount) over(order by order_id rows between 5 preceding and 1 preceding) as last_5_avg from `advanced.orders` order by order_date, user_id ;결과 2-11. 윈도우 함수 연습 문제(1번)1) 사용자별 쿼리를 실행한 총 횟수를 구하는 쿼리를 작성해주세요. 단, GROUP BY를 사용해서 집계하는 것이 아닌 query_logs의 데이터의 우측에 새로운 컬럼을 만들어주세요.select *, count(query_date) over() as total_cnt, # 전체 count(query_date) over(partition by user) as total_query_cnt from advanced.query_logs order by user, query_date ;결과2-11. 윈도우 함수 연습 문제(2번~6번)2) 주차별로 팀 내에서 쿼리를 많이 실행한 수를 구한 후, 실행한 수를 활용해 랭킹을 구해주세요. 단, 랭킹이 1등인 사람만 결과가 보이도록 해주세요with query_cnt_by_team as ( select extract(WEEK from query_date) as week_number, team, user, count(user) as query_cnt from advanced.query_logs group by all) select *, rank() over(partition by week_number, team order by query_cnt desc) as team_rank from query_cnt_by_team qualify team_rank = 1 order by week_number, team, query_cnt desc ;결과3) (2번 문제에서 사용한 주차별 쿼리 사용) 쿼리를 실행한 시점 기준 1주 전에 쿼리 실행 수를 별도의 컬럼으로 확인할 수 있는 쿼리를 작성해주세요with query_cnt_by_team as ( select user, team, extract(WEEK from query_date) as week_number, count(user) as query_cnt from advanced.query_logs group by all) select *, LAG(query_cnt, 1) OVER (partition by user ORDER BY week_number) AS prev_week_query_count -- user 단위, 전주차 week_number from query_cnt_by_team -- order by user -- team, query_cnt desc ; 결과4) 시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요select *, sum(query_cnt) over(partition by user order by query_date) as cumulative_sum, -- 왜되는거지? : frame의 default값 : unbounded preceding ~current row sum(query_cnt) over(partition by user order by query_date rows between unbounded preceding and current row) as cumulative_sum2 from (select user, team, query_date, count(*) as query_cnt from advanced.query_logs group by all) -- 검증 -- qualify cumulative_sum != cumulative_sum2 order by user, query_date ;결과5) 다음 데이터는 주문 횟수를 나타낸 데이터입니다. 만약 주문 횟수가 없으면 NULL로 기록됩니다. 이런 데이터에서 NULL 값이라고 되어있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해주세요WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT *, LAG(number_of_orders) over(order by date) as number_of_orders2, ifnull(number_of_orders, LAG(number_of_orders) over(order by date)) as number_of_orders3, LAST_VALUE(number_of_orders) over(order by date) as last_value_orders, LAST_VALUE(number_of_orders ignore nulls) over(order by date) as last_value_orders2 FROM raw_data ;결과6) 5번 문제에서 NULL을 채운 후, 2일 전 ~ 현재 데이터의 평균을 구하는 쿼리를 작성해주세요(이동 평균)WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL) -- select * from raw_data; select * , avg(number_of_orders) over(order by date rows between 2 preceding and current row) as moving_avg from (SELECT * except(number_of_orders), LAST_VALUE(number_of_orders ignore nulls) over(order by date) as number_of_orders FROM raw_data) ;결과2-11. 윈도우 함수 연습 문제(7번)with base as ( select event_date, datetime(timestamp_micros(event_timestamp), 'Asia/Seoul') as event_datetime, event_name, user_id, user_pseudo_id from advanced.app_logs where 1=1 and event_date = "2022-08-18" and user_pseudo_id = '1997494153.8491999091'), diff_data as( select *, datetime_diff(event_datetime, prev_event_datetime, second) as second_diff -- second로 간격설정 # second_diff기반으로 새로운 세션의 시작일지 아닐지 판단할 수 있음 from(select *, lag(event_datetime, 1) over(partition by user_pseudo_id order by event_datetime) as prev_event_datetime # event_datetime이랑 prev_event_datetime을 빼서 20초가 넘으면 새로운 세션으로 정의, # 20초가 넘지 않으면 기존 세션 from base)) select *, sum(session_start) over(partition by user_pseudo_id order by event_datetime) as session_num from(select *, case when prev_event_datetime is null then 1 when second_diff >= 20 then 1 else 0 end as session_start # session이 시작됨을 알리는 session_start from diff_data) order by event_datetime limit 20;결과 느낀점마지막 문제같은 실무에 사용 할 법한 예제 문제들을 더 만들어서 풀어봐야 할 것 같다.session기준에 대한 리서치를 더 해봐야 할 것 같다.
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
문제 1 SELECT *, LEAD(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS lead_visit_month , LEAD(visit_month, 2) OVER(PARTITION BY user_id ORDER BY visit_month) AS lead2_visit_month FROM advanced.analytics_function_01 ORDER BY user_id 문제 2 SELECT *, LEAD(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS lead_visit_month, LEAD(visit_month, 2) OVER(PARTITION BY user_id ORDER BY visit_month) AS lead2_visit_month, -- 다다음달 LAG(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS lag_visit_month -- 이전 달 (LAG) FROM `advanced.analytics_function_01` ORDER BY user_id, visit_month 문제 3 SELECT *, LEAD(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS after_visit_month, LEAD(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) - visit_month AS diff # 윈도우 함수를 이렇게 쓰는게 좋을까? => 중복된 쿼리는 줄이는 것이 좋을 수 있음 FROM `advanced.analytics_function_01` - 총 정리 문제 문제 1 SELECT *, COUNT(query_date) OVER(PARTITION BY user ORDER BY query_date) AS total_query_cnt # 38 row FROM `advanced.query_logs` 문제 2 WITH base AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL) SELECT *, RANK() OVER(PARTITION BY team ORDER BY query_cnt) AS rk FROM base QUALIFY rk = 1 ORDER BY week_number, team, query_cnt DESC 문제 3 WITH base AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL) SELECT *, LAG(query_cnt, 1) OVER(PARTITION BY user ORDER BY week_number) AS prev_week_cnt FROM base 문제 4 SELECT *, SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_sum, SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum2 FROM( SELECT query_date, team, user, COUNT(user) AS query_cnt FROM advanced.q 문제 5 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT *, LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date) AS last_value_orders FROM raw_data 문제 6 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) , filled_date AS( SELECT * EXCEPT(number_of_orders), LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date) AS number_of_orders FROM raw_data) SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_date 문제 7 WITH base AS( SELECT EVENT_DATE, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = '2022-08-18' AND user_pseudo_id = "1997494153.8491999091" ) , diff_date AS(SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff FROM ( SELECT *, LAG(EVENT_DATETIME, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime FROM base ORDER BY base.event_datetime)) SELECT *, SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num FROM ( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 ELSE 0 END AS session_start FROM diff_date)
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[ 빠짝 스터디 2주차 ] 윈도우 함수 연습 문제
탐색함수 연습 문제CREATE OR REPLACE TABLE advanced.analytics_function_01 AS ( SELECT 1004 AS user_id, 1 AS visit_month UNION ALL SELECT 1004, 3 UNION ALL SELECT 1004, 7 UNION ALL SELECT 1004, 8 UNION ALL SELECT 2112, 3 UNION ALL SELECT 2112, 6 UNION ALL SELECT 2112, 7 UNION ALL SELECT 3912, 4 ) SELECT * FROM advanced.analytics_function_01 --# 탐색 함수 1 SELECT user_id, visit_month, LEAD(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month FROM advanced.analytics_function_01 ORDER BY user_id --# 탐색 함수 2 SELECT user_id, visit_month, LEAD(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month, LAG(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS before_visit_month FROM advanced.analytics_function_01 ORDER BY user_id --# 탐색 함수 3 SELECT *, after_visit_month - visit_month AS diff FROM ( SELECT user_id, visit_month, LEAD(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month, FROM advanced.analytics_function_01 ORDER BY user_id ) --# 추가 문제 SELECT user_id, visit_month, FIRST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS fisrt_visit, LAST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) AS LAST_visit, FROM advanced.analytics_function_01 ORDER BY user_id프레임 연습 문제SELECT *, SUM(amount) OVER() AS amount_total, SUM(amount) OVER(ORDER BY order_id) AS cumulative_sum, SUM(amount) OVER(PARTITION BY user_id ORDER BY order_id) AS cumulative_sum_by_user, AVG(amount) OVER(ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_avg FROM advanced.orders ORDER BY order_id윈도우함수 연습 문제#1 SELECT *, COUNT(query_date) OVER (PARTITION BY user) as total_query_cnt FROM advanced.query_logs ORDER BY user, query_date #2 WITH TBL AS( SELECT EXTRACT(week from query_date) AS week_number ,team ,user ,COUNT(query_date) AS total_query_cnt FROM advanced.query_logs GROUP BY week_number, team, user ) SELECT week_number , team , user , total_query_cnt as query_count , RANK() OVER (PARTITION BY team ORDER BY total_query_cnt desc) AS team_rank FROM TBL QUALIFY team_rank = 1 ORDER BY week_number,team #3 WITH TBL AS( SELECT EXTRACT(week from query_date) AS week_number ,team ,user ,COUNT(query_date) AS total_query_cnt FROM advanced.query_logs GROUP BY week_number, team, user ) SELECT user ,team ,week_number ,total_query_cnt as query_count ,LAG(total_query_cnt) OVER (PARTITION BY user order by week_number) AS prev_week_query_count FROM TBL ORDER BY user, week_number; #4 SELECT user , team , query_date , query_count , SUM(query_count) OVER (PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_count FROM (SELECT user ,team ,query_date ,COUNT(query_date) AS query_count FROM advanced.query_logs GROUP BY user,team,query_date ) ORDER BY user, query_date #5 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , LAST_VALUE(number_of_orders ignore NULLS) OVER (ORDER BY date) AS number_of_orders FROM raw_data #6 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , number_of_orders , AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) FROM ( SELECT date ,LAST_VALUE(number_of_orders ignore NULLS) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) #7 WITH base AS ( SELECT event_date , event_timestamp , DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul') AS event_datetime , user_id ,user_pseudo_id FROM advanced.app_logs ), diff_sessions AS ( SELECT * , DATETIME_DIFF(event_datetime,prev_event_datetime, SECOND) AS second_diff FROM ( SELECT *, LAG(event_datetime) OVER (PARTITION BY user_pseudo_id ORDER BY base.event_datetime) AS prev_event_datetime FROM base ) ) SELECT * , SUM(session_start) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_start FROM ( SELECT * , CASE WHEN second_diff IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 ELSE 0 END as session_start FROM diff_sessions ) ORDER BY event_datetime
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습문제
탐색함수 연습문제문제 1) user들의 다음 접속 월과 다다음 접속 월을 구하는 쿼리를 작성해주세요. LEAD는 반드시 정렬이 먼저 되어야 함 -> ORDER BY 추가 SELECT *, LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead_visit_month, LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS lead2_visit_month FROM advanced.analytics_function_01 문제 2) user들의 다음 접속 월과 다다음 접속 월, 이전 접속 월을 구하는 쿼리를 작성해주세요. SELECT *, LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after_visit_month, LEAD(visit_month,2) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS after2_visit_month, LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ASC) AS before_visit_month ORDER BY user_id, visit_month FROM advanced.analytics_function_01 문제 3) 유저가 접속 했을 때, 다음 접속까지의 간격 SELECT *, after_visit_month - visit_month AS diff FROM ( SELECT *, LEAD(visit_month, 1) OVER (PARTITION BY user_id ORDER BY visit_month) AS after_visit_month FROM advanced.analytics_function_01 ) FRAME 연습문제-- amount_total: 전체 SUM -- cumulative_sum : row 시점에 누적 SUM -- cumulative_sum_by_user : row 시점에 유저별 누적 SUM -- last_5_orders_avg_amount : order_id 기준으로 정렬하고, 직전 5개 주문의 평균 amount -- 집계분석함수() OVER(PARTITION BY ~~~ ORDER BY ~~~ ROWS BETWEEN A AND B) SELECT *, SUM(amount) OVER() AS amount_total, -- OVER()에 아무것도 들어가지 않을 수 있음 SUM(amount) OVER(ORDER BY order_id) AS cumulative_sum, SUM(amount) OVER(PARTITION BY user_id ORDER BY order_id) AS cumulative_sum_by_user, AVG(amount) OVER(ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) FROM advanced.orders ORDER BY user_id, order_date ***QUALIFY 연습문제QUALIFY(조건 설정) WHERE 대신 QUALIFY를 사용하면 윈도우 함수의 결과에 대해 필터링할 수 있음 WHERE과 같이 사용하는 경우엔 WHERE 아래에 작성하면 됨 SELECT order_id, order_date, user_id, amount, SUM(amount) OVER (PARTITION BY user_id) AS amount_total FROM advanced.orders WHERE 1=1 QUALIFY amount_total >= 500 윈도우 함수 연습문제 (1~7) 문제 1SELECT user, team, query_date, COUNT(query_date) OVER (PARTITION BY user) AS total_query_cnt FROM advanced.query_logs ORDER BY user, query_date 문제 2 WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number, user, team, COUNT(query_date) AS total_query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT week_number, team, user, total_query_cnt, RANK() OVER (PARTITION BY team ORDER BY total_query_cnt DESC) AS ranking_in_team FROM base QUALIFY ranking_in_team = 1 ORDER BY week_number, team 문제 2WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number, user, team, COUNT(query_date) AS total_query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT week_number, team, user, total_query_cnt, RANK() OVER (PARTITION BY team ORDER BY total_query_cnt DESC) AS ranking_in_team FROM base QUALIFY ranking_in_team = 1 ORDER BY week_number, team 문제 3WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number, user, team, COUNT(query_date) AS query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT user, team, week_number, query_cnt, LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM base ORDER BY user, week_number 문제 4문제 4 SELECT user, team, query_date, query_count, SUM(query_count) OVER (PARTITION BY user ORDER BY query_date, ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_count FROM ( SELECT user, team, query_date, COUNT(query_date) AS query_count FROM advanced.query_logs GROUP BY 1,2,3 ) ORDER BY user, query_date 문제 5문제 5 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date, LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders FROM raw_data 문제 6WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date, number_of_orders, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM ( SELECT date, LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) 문제 7문제 7 WITH base AS ( SELECT event_date, event_timestamp, DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id, LAG(DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul')) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS before_event_datetime FROM advanced.app_logs WHERE 1=1 AND event_date = '2022-08-18' ) SELECT *, DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff, CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) IS NULL OR DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END AS session_start, SUM(CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) + 1 AS session_temp FROM base
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습문제
1. 윈도우 함수 연습문제(1) 사용자별 쿼리를 실행한 총 횟수를 구하는 쿼리는 작성해주세요.단, GROUP BY를 사용해서 집계하는 것이 아닌 query_logs의 데이터의 우측에 새로운 컬럼을 만들어주세요.SELECT *, COUNT(query_date) OVER(PARTITION BY user) AS cnt_query FROM advanced.query_logs(2) 주차별로 팀 내에서 쿼리를 많이 실행한 수를 구한 후, 실행한 수를 활용해 랭킹을 구해주세요. 단, 랭킹이 1등인 사람만 결과가 보이도록 해주세요.WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, RANK() OVER(PARTITION BY week_number, team ORDER BY query_cnt) AS rk FROM query_cnt_by_team QUALIFY rk = 1 ORDER BY week_number, team, query_cnt DESC*주차별 데이터는 EXTRACT 사용(3) (2번 문제에서 사용한 수치별 쿼리 사용) 쿼리를 실행한 시점 기준 1주 전에 쿼리 실행 수를 별도의 컬럼으로 확인할 수 있는 쿼리를 작성해주세요.WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, LAG(query_cnt) OVER(PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM query_cnt_by_team(4) 시간이 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요.-- 누적 쿼리 : frame. 과거의 시간(UNBOUNDED PRECEDING) 부터 CUREENT ROW 까지 -- frame의 default SELECT *, SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_sum FROM ( SELECT query_date, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) -- QUALIFY cumulative_sum != cumulative_sum2 -- WHERE, QUALIFY 조건 설정해서 2가지 조건이 같은지 비교 -> 같으면 != 연산 결과에 반환하는 값 없음 ORDER BY user, query_date(5) 다음 데이터는 주문 횟수를 나타낸 데이터입니다. 만약 주문 횟수가 없으면 NULL로 기록됩니다. 이런 데이터에서 NULL 값이라고 되어있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해주세요.WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT *, LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date) AS last_value_orders FROM raw_data*IGNORE NULLS 사용(6) 5번 문제에서 NULL을 채운 후, 2일전 ~ 현재 데이터의 평균을 구하는 쿼리를 작성해주세요. (이동 평균)WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) , filled_data AS ( SELECT * EXCEPT(number_of_orders), LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date) AS number_of_orders FROM raw_data ) SELECT *, AVG(number_of_orders) OVER(ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as moving_avg FROM filled_data*WITH문 두번 사용 시, ',' 사용(7) app_logs 테이블에서 Custom Session을 만들어 주세요. 이전 이벤트 로그와 20초가 지나면 새로운 Session을 만들어 주세요. Session은 숫자로 (1, 2, 3, ...) 표시해도 됩니다.2022-08-18일의 user_pseudo_id(1997494153.8491999091)은 session_id가 4까지 나옵니다. WITH base AS( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_date, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = "2022-08-18" AND user_pseudo_id = "1997494153.8491999091" ) , diff_data AS ( SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff FROM ( SELECT *, LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime FROM base ORDER BY event_datetime ) ) SELECT *, SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num -- session을 구할 때 쿼리가 길어질 수 있음. 하루에 접속을 여러번 하는 서비스 -> session 기반이 좋을 수 있고, 아니라고 하면 유저 집계가 나올 수 있음 FROM ( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 # 세션을 나누는 기준 초, 데이터를 탐색하면서 결정. ELSE NULL END AS session_start FROM diff_data ) *세션 기준을 직접 정의 가능
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
1번 SELECT *, LEAD(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_visit_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS nnext_visit_month FROM advanced.analytics_function_01 2번 SELECT *, LEAD(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_visit_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS nnext_visit_month, LAG(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS previous_visit_month FROM advanced.analytics_function_01 3번 SELECT *, SUM(amount) OVER (PARTITION BY user_id ORDER BY order_date, order_id ) AS cumulative_sum_by_user 4번 SELECT *, AVG(amount) OVER (ORDER BY order_date, order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amout ==================================================================================== 총정리 문제 1번 SELECT *, COUNT(query_date) OVER (PARTITION BY user ORDER BY user) AS total_query_cnt FROM advanced.query_logs 2번 WITH table AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, RANK() OVER(PARTITION BY week_number ORDER BY query_cnt) AS team_rank FROM table QUALIFY team_rank = 1 3번 WITH table AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM table 4번 WITH query_count_table AS ( SELECT *, COUNT(*) AS query_count FROM advanced.query_logs GROUP BY ALL ) SELECT *, SUM(query_count) OVER (PARTITION BY user ORDER BY query_date) AS cululative_query_count FROM query_count_table 5번 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07' , 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL UNION ALL ) SELECT *, LAST_VALUE(number_of_order IGNORE NULLS) OVER (ORDER BY date) AS last_value_orders FROM raw_data 6번 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07' , 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ), filled_data AS ( SELECT * EXCEPT(number_of_orders), **LAST_VALUE**(number_of_orders **IGNORE NULLS**) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_data 7번 WITH base AS( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = '2022-08-18' AND user_pseudo_id = "1997494153.8491999091" ), diff_data AS( SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff FROM ( SELECT *, LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime FROM base ) ) SELECT *, SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num FROM( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 ELSE NULL END AS session_start FROM diff_data ) ORDER BY event_datetime
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
윈도우 함수(탐색 함수) 연습 문제문제 1. user들의 다음 접속 월과 다다음 접속 월을 구하는 쿼리를 작성해주세요.SELECT * , LEAD(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next1_visit_month , LEAD(visit_month, 2) OVER(PARTITION BY user_id ORDER BY visit_month) AS next2_visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month 문제 2. user들의 다음 접속 월과 다다음 접속 월, 이전 접속 월을 구하는 쿼리를 작성해주세요.SELECT * , LEAD(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next1_visit_month , LEAD(visit_month, 2) OVER(PARTITION BY user_id ORDER BY visit_month) AS next2_visit_month , LAG(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS before_visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month 문제 3. user가 접속했을 때 다음 접속까지의 간격을 구하시오.SELECT * , next_visit_month - visit_month AS diff_month FROM ( SELECT * , LEAD(visit_month, 1) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month ) AS tmp 추가 문제. 유저의 첫 번째 방문 월, 마지막 방문 월을 구하는 쿼리를 작성해주세요.SELECT * , FIRST_VALUE(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS first_visit_month , LAST_VALUE(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS last_visit_month FROM advanced.analytics_function_01 ORDER BY user_id, visit_month 윈도우 함수(프레임, QUALIFY) 연습 문제문제 1. -- amount_total: 전체 SUM -- cumulative_sum: row 시점에 누적 SUM -- cumulative_sum_user: row 시점에 유저별 누적 SUM -- last_5_avg: order_id 기준으로 정렬하고, 직전 5개의 주문의 평균 amount SELECT * , SUM(amount) OVER(ORDER BY order_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS amount_total , SUM(amount) OVER(ORDER BY order_id) AS cumulative_sum , SUM(amount) OVER(PARTITION BY user_id ORDER BY order_id) AS cumulative_sum_user , AVG(amount) OVER(ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_avg FROM `inflearn-bigquery-437203.advanced.orders` ORDER BY order_id 문제 2. QUALIFY 로 윈도우 함수 조건 걸기SELECT * , SUM(amount) OVER(ORDER BY order_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS amount_total_1 , SUM(amount) OVER() AS amout_total_2 -- OVER 안에 아무것도 쓰지 않는 경우도 있다. 그러면 amount_total와 같은 결과가 계산된다. , SUM(amount) OVER(ORDER BY order_id) AS cumulative_sum , SUM(amount) OVER(PARTITION BY user_id ORDER BY order_id) AS cumulative_sum_user , AVG(amount) OVER(ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_avg FROM `inflearn-bigquery-437203.advanced.orders` QUALIFY amount_total_1 >= 500 윈도우 함수 연습 문제 (1~7번)문제 1. 사용자별 쿼리를 실행한 총 횟수를 구하는 쿼리를 작성해주세요. 단, GROUP BY를 사용해서 집계하는 것이 아닌 query_logs의 데이터의 우측에 새로운 컬럼을 만들어주세요.SELECT * , COUNT(*) OVER(PARTITION BY user) AS total_query_cnt FROM advanced.query_logs 문제 2. 주차별로 팀 내에서 쿼리를 많이 실행한 수를 구한 후, 실행한 수를 활용해 랭킹을 구해주세요. 단, 랭킹이 1등인 사람만 결과가 보이도록 해주세요.WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number , team , user , COUNT(*) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT * , RANK() OVER(PARTITION BY week_number, team ORDER BY query_cnt DESC) AS rk FROM query_cnt_by_team QUALIFY rk = 1 -- 윈도우 함수 결과에 대해 조건 걸 때 사용 ORDER BY week_number, team, query_cnt DESC 문제 3. (2번 문제에서 사용한 주차별 쿼리 사용) 쿼리를 실행한 시점 기준 1주 전에 쿼리 실행 수를 별도의 컬럼으로 확인할 수 있는 쿼리를 작성해주세요.WITH query_cnt_by_team AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number , team , user , COUNT(*) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT user , team , week_number , query_cnt , LAG(query_cnt, 1) OVER(PARTITION BY user ORDER BY week_number) as prev_week_query_cnt FROM query_cnt_by_team ORDER BY user, team, week_number 문제 4. 시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요.SELECT * , SUM(query_cnt) OVER(PARTITION BY user ORDER BY query_date) AS cumulative_query_cnt FROM ( SELECT * , COUNT(*) AS query_cnt FROM advanced.query_logs GROUP BY ALL) AS tmp -- 윈도우 함수 집계 함수 프레임 Default -- : RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW 문제 5. 다음 데이터는 주문 횟수를 나타낸 데이터. 만약 주문 횟수가 없으면 NULL로 기록된다. 이런 데이터에서 NULL 값이라고 되어 있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해주세요. (어려움!)WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT * , LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS last_value_orders FROM raw_data 문제 6. 5번 문제에서 NULL을 채운 후, 2일 전~ 현재 데이터의 평균을 구하는 쿼리를 작성해주세요. (이동평균)WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) , filled_data AS ( SELECT * , LAST_VALUE(number_of_orders IGNORE NULLS) OVER(ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS last_value_orders FROM raw_data ) SELECT date , last_value_orders , AVG(last_value_orders) OVER(ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_data 문제 7. 세션 id 만들기 WITH step1 AS ( SELECT event_date , event_timestamp , DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime , event_name , user_id , user_pseudo_id FROM advanced.app_logs WHERE event_date = "2022-08-18" AND user_pseudo_id = "1997494153.8491999091" ) -- 이전 이벤트가 발생한 시각을 나타내는 컬럼 추가 , step2 AS ( SELECT * , LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS before_event_datetime FROM step1 ) -- 현재 이벤트와 이전 이벤트의 발생 시간의 차이를 나타내는 컬럼 추가 , step3 AS ( SELECT * , DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff FROM step2 ) -- 세션 id 부여하는 컬럼 추가 SELECT * , SUM(session_start) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_id FROM ( SELECT * , IF(second_diff IS NULL OR second_diff > 20, 1, NULL) AS session_start FROM step3 ) AS tmp
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[ 빠짝 스터디 2주차 ] 윈도우 함수 연습 문제
탐색 함수 연습 문제 -- 문제 1) user들의 다음 접속 월과 다다음 접속 월을 구하는 쿼리를 작성해주세요. -- SELECT -- user_id, -- visit_month, -- LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) as next_visit_month, -- LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) as next_next_visit_month -- FROM advanced.analytics_function_01 -- 문제 2) user들의 다음 접속 월과 다다음 접속 월, 이전 접속 월을 구하는 쿼리를 작성해주세요. -- SELECT -- user_id, -- visit_month, -- LEAD(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) as next_visit_month, -- LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) as next_next_visit_month, -- LAG(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month) as last_visit_month -- FROM advanced.analytics_function_01 -- 추가 문제 : 이 데이터셋을 기준으로 user_id의 첫 방문월, 마지막 방문월을 구하는 쿼리를 작성해주세요. SELECT DISTINCT user_id, FIRST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as first_visit_month, LAST_VALUE(visit_month) OVER (PARTITION BY user_id ORDER BY visit_month ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) as last_visit_month FROM advanced.analytics_function_01 FramePIVOT 연습 문제에서 사용한 테이블 활용 SELECT *, SUM(amount) OVER () as amount_total, SUM(amount) OVER (ORDER BY order_id) as cumluative_sum, SUM(amount) OVER (PARTITION BY user_id ORDER BY order_id) as cumluative_sum_user, AVG(amount) OVER (ORDER BY order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) as last_5_avg FROM advanced.orders ORDER BY order_id 윈도우 함수 연습문제 -- 1) 사용자별 쿼리를 실행한 총 횟수를 구하는 쿼리를 작성해주세요. 단, GROUP BY를 사용해서 집계하는 것이 아닌 query_logs의 데이터의 우측에 새로운 컬럼을 만들어주세요. -- SELECT -- *, -- COUNT(*) OVER (PARTITION BY user) as total_query_count -- FROM advanced.query_logs -- 2) 주차별로 팀 내에서 쿼리를 많이 실행한 수를 구한 후, 실행한 수를 활용해 랭킹을 구해주세요. 단, 랭킹이 1등인 사람만 결과가 보이도록 해주세요 -- SELECT -- DISTINCT *, -- RANK() OVER (PARTITION BY week_number, team ORDER BY query_cnt desc) as team_rank -- FROM -- ( -- SELECT -- week_number, -- team, -- user, -- COUNT(query_date) OVER (PARTITION BY user, week_number) as query_cnt -- FROM -- (SELECT -- *, -- EXTRACT(WEEK FROM query_date) as week_number -- FROM advanced.query_logs -- ) -- ) -- QUALIFY team_rank = 1 -- ORDER BY week_number, team -- 문제 의도 : 원본 데이터는 row 마다 데이터가 있고, 그걸 집계해서 활용. GROUP BY 사용 후에 윈도우 함수 -- 3) (2번 문제에서 사용한 주차별 쿼리 사용) 쿼리를 실행한 시점 기준 1주 전에 쿼리 실행 수를 별도의 컬럼으로 확인할 수 있는 쿼리를 작성해주세요 -- SELECT -- *, -- LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) as prev_week_query_count -- FROM -- (SELECT -- distinct week_number, -- team, -- user, -- COUNT(query_date) OVER (PARTITION BY user, week_number) as query_cnt -- FROM -- (SELECT -- *, -- EXTRACT(WEEK FROM query_date) as week_number -- FROM advanced.query_logs -- ) -- ) -- ORDER BY user, week_number -- 4) 시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요 -- SELECT -- distinct *, -- COUNT(query_date) OVER (PARTITION BY user, query_date ORDER BY query_date) as query_cnt, -- COUNT(query_date) OVER (PARTITION BY user ORDER BY query_date) as cumulative_query_cnt -- FROM advanced.query_logs -- ORDER BY user, query_date -- 출제 의도 : Default Frame 이해 // 집계분석에서 Frame의 Default 값 = BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW -- 5) 다음 데이터는 주문 횟수를 나타낸 데이터입니다. 만약 주문 횟수가 없으면 NULL로 기록됩니다. 이런 데이터에서 NULL 값이라고 되어있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해주세요 -- WITH raw_data AS ( -- SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL -- SELECT DATE '2024-05-03', NULL UNION ALL -- SELECT DATE '2024-05-04', 16 UNION ALL -- SELECT DATE '2024-05-05', NULL UNION ALL -- SELECT DATE '2024-05-06', 18 UNION ALL -- SELECT DATE '2024-05-07', 20 UNION ALL -- SELECT DATE '2024-05-08', NULL UNION ALL -- SELECT DATE '2024-05-09', 13 UNION ALL -- SELECT DATE '2024-05-10', 14 UNION ALL -- SELECT DATE '2024-05-11', NULL UNION ALL -- SELECT DATE '2024-05-12', NULL -- ) -- SELECT -- date, -- CASE WHEN number_of_orders is not null THEN number_of_orders -- ELSE LAG(number_of_orders) OVER (ORDER BY date) -- END AS number_of_orders -- FROM raw_data -- 수정 -- SELECT -- *, -- LAST_VALUE(number_of_orders IGNORE NULLS) OVER (ORDER BY date) as last_value_orders -- FROM raw_data -- 출제 의도 : first_value, last_value 사용할 때 null을 포함하고 싶으면 ignore nulls 활용 -- 6) 5번 문제에서 NULL을 채운 후, 2일 전 ~ 현재 데이터의 평균을 구하는 쿼리를 작성해주세요(이동 평균) -- WITH raw_data AS ( -- SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL -- SELECT DATE '2024-05-03', NULL UNION ALL -- SELECT DATE '2024-05-04', 16 UNION ALL -- SELECT DATE '2024-05-05', NULL UNION ALL -- SELECT DATE '2024-05-06', 18 UNION ALL -- SELECT DATE '2024-05-07', 20 UNION ALL -- SELECT DATE '2024-05-08', NULL UNION ALL -- SELECT DATE '2024-05-09', 13 UNION ALL -- SELECT DATE '2024-05-10', 14 UNION ALL -- SELECT DATE '2024-05-11', NULL UNION ALL -- SELECT DATE '2024-05-12', NULL -- ) -- SELECT -- *, -- AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) as moving_avg -- FROM -- (SELECT -- date, -- CASE WHEN number_of_orders is not null THEN number_of_orders -- ELSE LAG(number_of_orders) OVER (ORDER BY date) -- END AS number_of_orders -- FROM raw_data -- ) -- 7) app_logs 테이블에서 Custom Session을 만들어 주세요. 이전 이벤트 로그와 20초가 지나면 새로운 Session을 만들어 주세요. Session은 숫자로 (1, 2, 3 ...) 표시해도 됩니다 -- 2022-08-18일의 user_pseudo_id(1997494153.8491999091)은 session_id가 4까지 나옵니다 WITH base AS ( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') as event_datetime, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = '2022-08-18' and user_pseudo_id = '1997494153.8491999091' ), diff_data AS ( SELECT *, DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) as second_diff FROM ( SELECT *, LAG(event_datetime) OVER (PARTITION BY user_pseudo_id ORDER BY base.event_datetime) as prev_event_datetime FROM base ) ) SELECT *, SUM(session_start) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num -- session 을 구할 때 쿼리가 길어질 수 있음. 하루에 접속을 여러번 하는 서비스 -> session 기반이 좋을 수 있고, 아니라면 일자별 유저 집계가 나을 수 있음 FROM ( SELECT *, CASE WHEN second_diff IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 #세션을 나누는 기준. 데이터 탐색 후 결정 / 보통 앱 로그는 30초 or 60초 ELSE 0 END as session_start FROM diff_data ) ORDER BY event_datetime -- 세션 정리 -- 이전 이벤트 로그와 현재 이벤트 로그의 diff => 초나 분을 구한다 -- 기준을 가지고 그 기준 보다 높으면 새로운 세션이라고 한다 -- 첫번째 값엔 null이 있을 수 있어, 이 부분도 챙겨야 한다 -- 새로운 세션, session_start 기반으로 누적합 => session_num이 된다
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습 문제
문제 1번SELECT user , team , query_date , COUNT(query_date) OVER (PARTITION BY user) AS total_query_cnt FROM advanced.query_logs ORDER BY user, query_date; 문제 2번WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number , user , team , COUNT(query_date) AS total_query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT week_number , team , user , total_query_cnt , RANK() OVER (PARTITION BY team ORDER BY total_query_cnt DESC) AS ranking_in_team FROM base QUALIFY ranking_in_team = 1 ORDER BY week_number, team; 문제 3번WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number , user , team , COUNT(query_date) AS query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT user , team , week_number , query_cnt , LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM base ORDER BY user, week_number; 문제 4번SELECT user , team , query_date , query_count , SUM(query_count) OVER (PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_count FROM ( SELECT user , team , query_date , COUNT(query_date) AS query_count FROM advanced.query_logs GROUP BY 1,2,3 ) ORDER BY user, query_date; 문제 5번WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders FROM raw_data; 문제 6번WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , number_of_orders , AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM ( SELECT date , LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) 문제 7번WITH base AS ( SELECT event_date , event_timestamp , DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul') AS event_datetime , event_name , user_id , user_pseudo_id , LAG(DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul')) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS before_event_datetime FROM advanced.app_logs WHERE 1=1 AND event_date = '2022-08-18' ) SELECT * ,DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff , CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) IS NULL OR DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END AS session_start , SUM(CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) + 1 AS session_temp FROM base
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 탐색 함수 연습 문제, 윈도우 함수 FRAME 연습 문제, 윈도우 함수 총정리 연습 문제
<FRAME 문제> 1번문제: 총 수량(amount_total), 수량의 누적 합(cumulativ_sum), 유저별 수량의 누적 합(cumulative_sum(user)), 최근 5개 수량의 평균(last_5_avg) 출력쿼리를 작성하는 목표, 확인할 지표: 수량의 총합 또는 누적 합 구하기쿼리 계산 방법: 윈도우 함수 - AVG, SUM데이터의 기간: X사용할 테이블: advanced.ordersJOIN KEY: X데이터 특징: XSELECT order_id, -- 각 주문의 고유 식별자 order_date, -- 주문 날짜 user_id, -- 주문한 사용자 ID amount, -- 주문 금액 SUM(amount) OVER (ORDER BY order_date, order_id ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS amount_total, -- 주문 데이터 전체의 총 금액을 계산합니다. -- 'ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING'은 모든 행을 참조한다는 의미로, 테이블의 전체 합계를 계산합니다. SUM(amount) OVER (ORDER BY order_date, order_id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum, -- 날짜와 주문 ID 순서로 누적 합계를 계산합니다. -- 'ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW'는 현재 행까지의 모든 이전 행을 포함하여 누적 합계를 계산합니다. SUM(amount) OVER (PARTITION BY user_id ORDER BY order_date, order_id ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_sum_by_user, -- 사용자별로 누적 합계를 계산합니다. -- 'PARTITION BY user_id'는 각 사용자별로 데이터를 분리해 누적 합계를 계산하도록 합니다. -- 즉, 각 사용자가 주문한 금액의 누적 합계를 구합니다. AVG(amount) OVER (ORDER BY order_date, order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amount -- 현재 행을 기준으로 바로 이전 5개의 행의 주문 금액 평균을 계산합니다. -- 'ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING'은 현재 행 이전의 5개 행에서 1개 행까지 포함하여 평균을 구합니다. -- 이를 통해 최근 5건의 주문에 대한 평균 금액을 알 수 있습니다. FROM advanced.orders -- advanced 데이터셋의 orders 테이블에서 데이터를 가져옵니다. --QUALIFY last_5_orders_avg_amount >= 150 -- 마지막 5건의 주문 평균 금액이 150 이상인 주문만을 결과로 필터링하기 위한 조건입니다. 현재는 주석 처리되어 있어 실행되지 않습니다. ORDER BY order_id; -- 주문 ID 기준으로 결과를 정렬하여 각 주문에 대해 계산된 값을 순서대로 확인합니다. <총 정리 문제> 1번문제:사용자별 쿼리를 실행한 총 횟수를 구하는 쿼리를 작성해주세요. 우측에 새로운 컬럼을 만들어주세요. SELECT * -- 모든 열을 선택합니다. advanced.query_logs 테이블의 모든 컬럼을 그대로 포함합니다. , COUNT(user) OVER (PARTITION BY user) AS total_query_cnt -- 'user'별로 쿼리 실행 횟수를 계산합니다. -- 'PARTITION BY user'를 사용하여 각 사용자별로 나누고, 그 안에서 해당 사용자가 몇 번의 쿼리를 실행했는지 계산합니다. -- 이 값은 각 사용자가 테이블에 몇 번 등장했는지를 의미합니다. FROM advanced.query_logs -- advanced 데이터셋의 query_logs 테이블에서 데이터를 가져옵니다. ORDER BY query_date, user; -- 'query_date'와 'user'를 기준으로 데이터를 정렬합니다. 'query_date'를 우선적으로, 같은 날짜 내에서는 'user'를 기준으로 정렬합니다. 2번문제: 주차별로 팀 내에서 쿼리를 많이 실행한 수를 구한 후, 실행한 수를 활용해 랭킹을 구해주세요. 단, 랭킹이 1등인 사람만 결곽가 보이도록 해주세요.SELECT * -- 모든 열을 선택합니다. 서브쿼리에서 생성된 모든 컬럼을 포함합니다. , RANK() OVER (PARTITION BY week_number, team ORDER BY query_cnt DESC) AS team_rank -- RANK() 함수를 사용하여 각 팀 내에서 쿼리 실행 횟수(query_cnt)에 따라 랭킹을 부여합니다. -- 'PARTITION BY week_number, team'은 팀과 주차별로 데이터를 나누어 그 안에서 랭킹을 계산합니다. -- 'ORDER BY query_cnt DESC'는 쿼리 실행 횟수에 따라 랭킹을 내림차순으로 정렬합니다. -- 이 결과로 각 팀 내에서 쿼리 실행 횟수에 따라 랭킹이 부여된 'team_rank'라는 새로운 열이 추가됩니다. FROM (SELECT EXTRACT(WEEK FROM query_date) AS week_number -- query_date에서 주차를 추출하여 'week_number' 열을 만듭니다. , team -- 팀을 나타내는 컬럼을 선택합니다. , user -- 사용자를 나타내는 컬럼을 선택합니다. , COUNT(user) AS query_cnt -- 사용자가 쿼리를 실행한 횟수를 집계하여 'query_cnt'라는 열로 만듭니다. FROM advanced.query_logs -- advanced 데이터셋의 query_logs 테이블에서 데이터를 가져옵니다. GROUP BY ALL) -- 모든 선택된 컬럼을 그룹화합니다. 여기서는 'week_number', 'team', 'user'로 데이터를 그룹화합니다. QUALIFY team_rank = 1 -- QUALIFY 절은 필터링 조건을 적용하는 역할을 하며, 여기서는 RANK가 1인 경우만 선택합니다. -- 즉, 각 팀 내에서 쿼리 실행 횟수가 가장 많은 사용자(1등)만 필터링됩니다. ORDER BY week_number, team; -- 주차별로, 그리고 각 주차 내에서 팀별로 데이터를 정렬합니다. 3번문제: (2번 문제에서 사용한 주차별 쿼리 사용) 쿼리를 실행한 시점 기준 1주 전에 쿼리 실행 수를 별도의 컬럼으로 확인할 수 있는 쿼리를 작성해주세요.SELECT * -- 모든 열을 선택합니다. 서브쿼리에서 생성된 모든 컬럼을 포함합니다. , LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_count -- LAG() 함수를 사용하여 이전 주차(week)의 쿼리 실행 수를 가져옵니다. -- 'PARTITION BY user'를 통해 각 사용자별로 데이터를 나누고, 각 사용자에 대해 'week_number' 순서로 정렬하여 이전 주차의 값을 가져옵니다. -- 결과적으로, 이전 주차의 쿼리 실행 횟수를 'prev_week_query_count'라는 새로운 열로 추가합니다. FROM (SELECT user -- 쿼리를 실행한 사용자를 나타내는 열입니다. , team -- 사용자가 속한 팀을 나타내는 열입니다. , EXTRACT(WEEK FROM query_date) AS week_number -- query_date에서 주차(week)를 추출하여 'week_number'라는 열로 만듭니다. , COUNT(user) AS query_cnt -- 사용자가 실행한 쿼리 횟수를 집계하여 'query_cnt' 열로 만듭니다. FROM advanced.query_logs -- advanced 데이터셋의 query_logs 테이블에서 데이터를 가져옵니다. GROUP BY ALL) -- 선택된 모든 열을 그룹화합니다. ORDER BY user, week_number; -- 최종 결과를 사용자별로 정렬하고, 그다음 각 사용자 내에서 주차별로 정렬합니다. 4번문제:시간의 흐름에 따라, 일자별로 유저가 실행한 누적 쿼리 수를 작성해주세요.SELECT * -- 모든 열을 선택합니다. 서브쿼리에서 생성된 모든 컬럼을 포함합니다. , SUM(query_cnt) OVER (PARTITION BY user ORDER BY query_date) AS cumulative_query_count -- SUM 윈도우 함수를 사용하여 누적 쿼리 실행 수를 계산합니다. -- 'PARTITION BY user'를 통해 각 사용자별로 데이터를 나누고, 각 사용자에 대해 'query_date' 순서로 정렬하여 누적 값을 계산합니다. -- 결과적으로, 각 사용자에 대한 누적 쿼리 실행 횟수가 'cumulative_query_count'라는 새로운 열로 추가됩니다. FROM (SELECT user -- 쿼리를 실행한 사용자를 나타내는 열입니다. , team -- 사용자가 속한 팀을 나타내는 열입니다. , query_date -- 쿼리가 실행된 날짜를 나타내는 열입니다. , COUNT(user) AS query_cnt -- 사용자가 쿼리를 실행한 횟수를 집계하여 'query_cnt' 열로 만듭니다. FROM advanced.query_logs -- advanced 데이터셋의 query_logs 테이블에서 데이터를 가져옵니다. GROUP BY ALL) -- 선택된 모든 열을 그룹화하여 중복되지 않게 집계합니다. ORDER BY user, query_date; -- 최종 결과를 사용자별로 정렬하고, 그다음 각 사용자의 쿼리 실행 날짜별로 정렬합니다. 5번문제:다음 데이터는 주문 횟수를 나타낸 데이터입니다. 만약 주문 횟수가 없으면 NULL로 기록됩니다. 이런 데이터에서 NULL 값이라고 되어있는 부분을 바로 이전 날짜의 값으로 채워주는 쿼리를 작성해주세요.WITH raw_data AS ( -- 날짜와 해당 날짜의 주문 수를 포함한 임시 테이블을 생성합니다. SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) -- 임시 데이터를 활용하여 쿼리의 기능을 확인하기 위한 서브쿼리입니다. SELECT * -- 모든 열을 선택합니다 (날짜와 주문 수). , LAST_VALUE(number_of_orders IGNORE NULLS) OVER (ORDER BY date) AS number_of_orders -- NULL 값을 제외하고 마지막으로 발견된 'number_of_orders' 값을 가져옵니다. -- 'ORDER BY date'를 사용하여 날짜 순서대로 정렬하며, -- 각 행에서 가장 최근의 유효한 (NULL이 아닌) 'number_of_orders' 값을 반환합니다. FROM raw_data -- 데이터가 저장된 'raw_data' 임시 테이블에서 데이터를 조회합니다. ORDER BY date; -- 최종 결과를 날짜 순서대로 정렬하여 출력합니다. 6번문제: 5번 문제에서 NULL을 채운 후, 2일 전 ~ 현재 데이터의 평균을 구하는 쿼리를 작성해주세요.(이동 평균)WITH raw_data AS ( -- 임시 데이터를 정의합니다. 날짜(date)와 해당 날짜의 주문 수(number_of_orders)를 포함한 테이블을 생성합니다. SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) -- 누락된 데이터를 보완하고, 이동 평균을 계산하는 쿼리를 작성합니다. SELECT * -- 모든 열을 선택한 뒤, 이동 평균(moving_avg)을 추가로 계산합니다. , ROUND(AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW), 1) AS moving_avg -- AVG 함수를 사용하여 현재 행을 포함해 이전 두 행의 값에 대한 이동 평균을 계산합니다. -- 계산된 결과를 소수점 첫 번째 자리까지 반올림합니다. FROM ( SELECT date, -- IFNULL 함수를 사용하여 NULL 값을 보완합니다. -- 이전 날짜의 주문 수를 가져와 현재 행의 주문 수가 NULL인 경우 보완합니다. IFNULL(number_of_orders, LAG(number_of_orders) OVER (ORDER BY date)) AS number_of_orders FROM raw_data ) -- 최종 결과를 날짜 순서로 정렬하여 출력합니다. ORDER BY date; 7번문제: app_logs 테이블에서 Custom Session을 만들어 주세요. 이전 이벤트 로그와 20초가 지나면 새로운 Session을 만들어 주세요. Session은 숫자로 (1, 2, 3 …) 표시해도 됩니다. 2022-08-18일의 user_pseudo_id(1997494153.8491999091)은 session_id가 4까지 나옵니다.WITH base AS ( -- Step 1. 기초 데이터 추출 및 이전 이벤트 시간 계산 SELECT event_date, -- 이벤트가 발생한 날짜 event_timestamp, -- 마이크로초 단위의 이벤트 발생 시각 DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, -- 이벤트 발생 시각을 'Asia/Seoul' 시간대로 변환하여 가독성을 높임 event_name, -- 이벤트의 이름 user_id, -- 사용자 ID user_pseudo_id, -- 사용자 고유의 익명화된 ID DATETIME(TIMESTAMP_MICROS(LAG(event_timestamp) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp)), 'Asia/Seoul') AS before_event_datetime -- 이전 이벤트의 발생 시각을 'Asia/Seoul' 시간대로 변환 -- LAG() 함수를 사용해 각 사용자의 이전 이벤트 발생 시각을 가져옴 FROM advanced.app_logs WHERE event_date = '2022-08-18' -- 특정 날짜의 이벤트만 필터링 AND user_pseudo_id = '1997494153.8491999091' -- 특정 사용자에 대한 로그만 조회 ), -- Step 2. 세션 유지 시간 및 신규 세션 여부 계산 session_info AS ( SELECT *, TIMESTAMP_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff, -- 이전 이벤트와 현재 이벤트 사이의 시간 차이를 초 단위로 계산 CASE WHEN TIMESTAMP_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 OR TIMESTAMP_DIFF(event_datetime, before_event_datetime, SECOND) IS NULL THEN 1 -- 두 이벤트 사이의 시간 차이가 20초 이상이거나 이전 이벤트가 없는 경우 새로운 세션 시작으로 간주 ELSE NULL END AS session_start -- 새로운 세션이 시작되면 1, 그렇지 않으면 NULL FROM base ) -- Step 3. 신규 세션 ID 세팅 SELECT *, SUM(session_start) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_id -- 각 사용자의 이벤트 시퀀스에서 새로운 세션이 시작될 때마다 session_start 값을 누적하여 세션 ID를 부여 FROM session_info ORDER BY event_date, event_timestamp; -- 이벤트 발생 시각을 기준으로 정렬하여 결과를 출력
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
[빠짝스터디 2주차 과제] 윈도우 함수 연습문제
문제 1번-- 1. 사용자별 쿼리를 실행한 횟수의 총합 -- GROUP BY 사용 X. 우측에 새로운 컬럼 생성하기. SELECT user , team , query_date , COUNT(query_date) OVER (PARTITION BY user) AS total_query_cnt FROM advanced.query_logs ORDER BY user, query_date; 문제 2번-- 2. 주차별로 팀내에서 쿼리를 많이 실행한 수 → 랭킹 구하기 -- 팀별로 랭킹이 1위인 사람만 출력하기 WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number , user , team , COUNT(query_date) AS total_query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT week_number , team , user , total_query_cnt , RANK() OVER (PARTITION BY team ORDER BY total_query_cnt DESC) AS ranking_in_team FROM base QUALIFY ranking_in_team = 1 ORDER BY week_number, team; 문제 3번-- 3. (2번 문제에서 사용한 주차별 쿼리 사용) 쿼리를 실행한 시점 -- 1주 전에 쿼리를 실행한 횟수를 별도의 컬럼으로 출력 WITH base AS ( SELECT EXTRACT(week FROM query_date) AS week_number , user , team , COUNT(query_date) AS query_cnt FROM advanced.query_logs GROUP BY week_number, user, team ) SELECT user , team , week_number , query_cnt , LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM base ORDER BY user, week_number; 문제 4번-- 4. 일자별 유저가 쿼리한 횟수 누적합 SELECT user , team , query_date , query_count , SUM(query_count) OVER (PARTITION BY user ORDER BY query_date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_query_count FROM ( SELECT user , team , query_date , COUNT(query_date) AS query_count FROM advanced.query_logs GROUP BY 1,2,3 ) ORDER BY user, query_date; 문제 5번-- 주문 횟수 테이블: 데이터 없으면 NULL로 표기됨. -- 5. NULL값을 바로 전달 데이터로 채우기 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders FROM raw_data; 문제 6번-- 6. (5번 완료 후) 2일 전 ~ 현재 데이터의 평균 구하기 WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07', 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ) SELECT date , number_of_orders , AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM ( SELECT date , LAST_VALUE(number_of_orders ignore nulls) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) 문제 7번-- 7. app_logs 테이블에서 커스텀 세션 추가 -- 이전 이벤트 로그와 20초 이상 차이가 나면 새로운 세션으로 정의 -- 세션은 숫자로(1,2,3...) 표시 가능 WITH base AS ( SELECT event_date , event_timestamp , DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul') AS event_datetime , event_name , user_id , user_pseudo_id , LAG(DATETIME(TIMESTAMP_MICROS(event_timestamp),'Asia/Seoul')) OVER (PARTITION BY user_pseudo_id ORDER BY event_timestamp) AS before_event_datetime FROM advanced.app_logs WHERE 1=1 AND event_date = '2022-08-18' ) SELECT * ,DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) AS second_diff , CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) IS NULL OR DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END AS session_start , SUM(CASE WHEN DATETIME_DIFF(event_datetime, before_event_datetime, SECOND) >= 20 THEN 1 ELSE 0 END) OVER (PARTITION BY user_pseudo_id ORDER BY event_datetime) + 1 AS session_temp FROM base
-
미해결BigQuery(SQL) 활용편(퍼널 분석, 리텐션 분석)
2주차 과제 제출
문제 1.SELECT *, LEAD(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_visit_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS nnext_visit_month FROM advanced.analytics_function_01문제2.SELECT *, LEAD(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS next_visit_month, LEAD(visit_month, 2) OVER (PARTITION BY user_id ORDER BY visit_month) AS nnext_visit_month, LAG(visit_month) OVER(PARTITION BY user_id ORDER BY visit_month) AS previous_visit_month FROM advanced.analytics_function_01문제3.SELECT *, SUM(amount) OVER (PARTITION BY user_id ORDER BY order_date, order_id ) AS cumulative_sum_by_user문제 4.SELECT *, AVG(amount) OVER (ORDER BY order_date, order_id ROWS BETWEEN 5 PRECEDING AND 1 PRECEDING) AS last_5_orders_avg_amout 총정리 연습문제문제 1.SELECT *, COUNT(query_date) OVER (PARTITION BY user ORDER BY user) AS total_query_cnt FROM advanced.query_logs문제2.WITH table AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, RANK() OVER(PARTITION BY week_number ORDER BY query_cnt) AS team_rank FROM table QUALIFY team_rank = 1문제3.WITH table AS ( SELECT EXTRACT(WEEK FROM query_date) AS week_number, team, user, COUNT(user) AS query_cnt FROM advanced.query_logs GROUP BY ALL ) SELECT *, LAG(query_cnt) OVER (PARTITION BY user ORDER BY week_number) AS prev_week_query_cnt FROM table문제4.WITH query_count_table AS ( SELECT *, COUNT(*) AS query_count FROM advanced.query_logs GROUP BY ALL ) SELECT *, SUM(query_count) OVER (PARTITION BY user ORDER BY query_date) AS cululative_query_count FROM query_count_table문제5.WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07' , 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL UNION ALL ) SELECT *, LAST_VALUE(number_of_order IGNORE NULLS) OVER (ORDER BY date) AS last_value_orders FROM raw_data문제6.WITH raw_data AS ( SELECT DATE '2024-05-01' AS date, 15 AS number_of_orders UNION ALL SELECT DATE '2024-05-02', 13 UNION ALL SELECT DATE '2024-05-03', NULL UNION ALL SELECT DATE '2024-05-04', 16 UNION ALL SELECT DATE '2024-05-05', NULL UNION ALL SELECT DATE '2024-05-06', 18 UNION ALL SELECT DATE '2024-05-07' , 20 UNION ALL SELECT DATE '2024-05-08', NULL UNION ALL SELECT DATE '2024-05-09', 13 UNION ALL SELECT DATE '2024-05-10', 14 UNION ALL SELECT DATE '2024-05-11', NULL UNION ALL SELECT DATE '2024-05-12', NULL ), filled_data AS ( SELECT * EXCEPT(number_of_orders), **LAST_VALUE**(number_of_orders **IGNORE NULLS**) OVER (ORDER BY date) AS number_of_orders FROM raw_data ) SELECT *, AVG(number_of_orders) OVER (ORDER BY date ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS moving_avg FROM filled_data문제 7.WITH base AS( SELECT event_date, DATETIME(TIMESTAMP_MICROS(event_timestamp), 'Asia/Seoul') AS event_datetime, event_name, user_id, user_pseudo_id FROM advanced.app_logs WHERE event_date = '2022-08-18' AND user_pseudo_id = "1997494153.8491999091" ), diff_data AS( SELECT *, #이전 이벤트 시간과 현재 이벤트시간의 간격을 SECOND 초단위로 구하기 / second_diff를 기반으로 새로운 세션의 시작일지 아닐지 판단 DATETIME_DIFF(event_datetime, prev_event_datetime, SECOND) AS second_diff FROM ( SELECT *, # 직전 이벤트 시간을 prev_event_datetime으로 만들기 LAG(event_datetime, 1) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS prev_event_datetime FROM base ) ) SELECT *, SUM(session_start) OVER(PARTITION BY user_pseudo_id ORDER BY event_datetime) AS session_num FROM( SELECT *, CASE WHEN prev_event_datetime IS NULL THEN 1 WHEN second_diff >= 20 THEN 1 ELSE NULL END AS session_start FROM diff_data ) ORDER BY event_datetime