Wednesday, August 13, 2025

Challenging Big Data SQL Questions with Solutions

 

1.⁠ ⁠Write a SQL query to get the daily count of active users (logged in at least once).

>-- Daily active users: one row per calendar day, counting each user at
>-- most once per day regardless of how many times they logged in.
SELECT
    DATE(login_time) AS login_date,
    COUNT(DISTINCT user_id) AS active_user_count
FROM user_logins
GROUP BY DATE(login_time)
ORDER BY login_date;

2.⁠ ⁠Find the 2nd highest transaction per user without using LIMIT or TOP.

>-- 2nd highest transaction per user without LIMIT/TOP: a row qualifies
>-- when exactly two distinct amounts for that user are >= its own amount
>-- (the amount itself plus one strictly larger distinct amount).
SELECT
    txn.user_id,
    txn.transaction_amount AS second_highest_transaction
FROM transactions AS txn
WHERE (
        SELECT COUNT(DISTINCT higher.transaction_amount)
        FROM transactions AS higher
        WHERE higher.user_id = txn.user_id
          AND higher.transaction_amount >= txn.transaction_amount
      ) = 2;


3.⁠ ⁠Identify data gaps in time-series event logs (e.g., missing hourly records).

>-- Missing hourly buckets: generate every hour between the first and the
>-- last logged event, then keep the hours with no matching log record.
WITH expected_hours AS (
    SELECT generate_series(
        (SELECT MIN(date_trunc('hour', event_time)) FROM event_logs),
        (SELECT MAX(date_trunc('hour', event_time)) FROM event_logs),
        INTERVAL '1 hour'
    ) AS expected_time
),
logged_hours AS (
    -- One row per hour that actually has at least one event.
    SELECT DISTINCT date_trunc('hour', event_time) AS logged_hour
    FROM event_logs
)
SELECT eh.expected_time
FROM expected_hours AS eh
LEFT JOIN logged_hours AS lh
    ON lh.logged_hour = eh.expected_time
WHERE lh.logged_hour IS NULL
ORDER BY eh.expected_time;

4.⁠ ⁠Fetch the first purchase date per user and calculate days since then.

>-- Earliest purchase per user and the elapsed days since it.
>-- NOTE(review): DATE - DATE yielding an integer day count is Postgres
>-- behavior; other engines need DATEDIFF — confirm the target dialect.
SELECT
    user_id,
    MIN(purchase_date) AS first_purchase_date,
    CURRENT_DATE - MIN(purchase_date) AS days_since_first_purchase
FROM purchases
GROUP BY user_id;


5.⁠ ⁠Detect schema changes in SCD Type 2 tables using Delta Lake.

> In Delta Lake, run DESCRIBE HISTORY table_name and look for schema-changing operations (e.g. ADD COLUMNS / CHANGE COLUMN), or compare the current DESCRIBE TABLE output against a saved baseline schema to detect changes.

6.⁠ ⁠Join product and transaction tables and filter out null foreign keys safely.

>-- Product details for each transaction. The INNER JOIN already drops
>-- rows whose product_id is NULL (NULL never matches a join condition);
>-- the explicit IS NOT NULL filter documents that intent.
SELECT
    t.transaction_id,
    t.product_id,
    t.transaction_date,
    p.product_name
FROM transaction AS t
INNER JOIN product AS p
    ON p.product_id = t.product_id
WHERE t.product_id IS NOT NULL;



7.⁠ ⁠Get users who upgraded to premium within 7 days of signup.

> -- Users who upgraded to premium within 7 days of signing up.
-- The IS NOT NULL guard is technically implied by the <= comparison
-- (NULL <= x is UNKNOWN), but it states the intent explicitly.
SELECT
    user_id,
    signup_date,
    premium_upgrade_date
FROM users
WHERE premium_upgrade_date IS NOT NULL
  AND premium_upgrade_date <= signup_date + INTERVAL '7 days';


8.⁠ ⁠Calculate cumulative distinct product purchases per customer.

> -- Cumulative count of distinct products each customer has purchased,
-- as of each purchase date.
-- Rewritten from the original correlated scalar subquery (which
-- re-scanned the ranked set once per output row — O(n^2) — and computed
-- a ROW_NUMBER `rn` it never used) to a single window pass: flag the
-- first appearance of each (customer, product), then take a running sum.
WITH unique_products AS (
    -- Collapse exact duplicate (customer, product, date) rows.
    SELECT DISTINCT customer_id, product_id, purchase_date
    FROM purchases
),
first_seen AS (
    -- 1 on the earliest row of each (customer, product), else 0.
    SELECT
        customer_id,
        product_id,
        purchase_date,
        CASE
            WHEN ROW_NUMBER() OVER (
                     PARTITION BY customer_id, product_id
                     ORDER BY purchase_date
                 ) = 1 THEN 1
            ELSE 0
        END AS is_first_purchase
    FROM unique_products
)
SELECT
    customer_id,
    purchase_date,
    product_id,
    -- Default RANGE frame includes same-date peers, matching the
    -- original's "purchase_date <= current date" semantics on ties.
    SUM(is_first_purchase) OVER (
        PARTITION BY customer_id
        ORDER BY purchase_date
    ) AS cumulative_distinct_products
FROM first_seen
ORDER BY customer_id, purchase_date;


9.⁠ ⁠Retrieve customers who spent above average in their region.

> -- Customers whose total spend exceeds the average per-customer spend in
-- their region. AVG(SUM(amount)) nests an aggregate inside a window:
-- per-customer totals are computed by GROUP BY first, then averaged
-- across the region partition.
WITH spend AS (
    SELECT
        customer_id,
        region,
        SUM(amount) AS total_spend,
        AVG(SUM(amount)) OVER (PARTITION BY region) AS avg_region_spend
    FROM transactions
    GROUP BY customer_id, region
)
SELECT
    customer_id,
    region,
    total_spend
FROM spend
WHERE total_spend > avg_region_spend;


10.⁠ ⁠Find duplicate rows in an ingestion table (based on all columns).

>-- Duplicate ingestion rows: group on every column (listed explicitly —
>-- "all columns" here means id, name, email, created_at) and keep the
>-- combinations that occur more than once, with their multiplicity.
SELECT
    id,
    name,
    email,
    created_at,
    COUNT(*) AS duplicate_count
FROM ingestion_table
GROUP BY
    id,
    name,
    email,
    created_at
HAVING COUNT(*) > 1;

OR

> -- Every physical row that is part of a duplicate group.
-- The original tuple-IN form silently misses duplicates containing
-- NULLs: (a, b) IN (...) evaluates to UNKNOWN when any element is NULL,
-- while GROUP BY treats NULLs as equal. A window COUNT partitioned on
-- the same columns also treats NULLs as equal, so NULL-bearing
-- duplicates are flagged correctly — and the table is scanned once.
SELECT
    id,
    name,
    email,
    created_at
FROM (
    SELECT
        id,
        name,
        email,
        created_at,
        COUNT(*) OVER (PARTITION BY id, name, email, created_at) AS dup_count
    FROM ingestion_table
) AS counted
WHERE dup_count > 1;

11.⁠ ⁠Compute daily revenue growth % using lag window function.

>-- Day-over-day revenue growth %. The previous-day value is computed
>-- once in a CTE instead of repeating the LAG expression three times;
>-- NULLIF guards against division by zero.
WITH with_prev AS (
    SELECT
        revenue_date,
        total_revenue,
        LAG(total_revenue) OVER (ORDER BY revenue_date) AS previous_day_revenue
    FROM daily_revenue
)
SELECT
    revenue_date,
    total_revenue,
    previous_day_revenue,
    ROUND(
        (total_revenue - previous_day_revenue)
        / NULLIF(previous_day_revenue, 0) * 100,
        2
    ) AS revenue_growth_percent
FROM with_prev
ORDER BY revenue_date;


12.⁠ ⁠Identify products with declining sales 3 months in a row.

>-- Products with strictly declining sales across 3 consecutive recorded
>-- months (two month-over-month drops in a row).
>-- NOTE(review): LAG steps over *recorded* months — if a product skips a
>-- month, the lags are not calendar-consecutive; confirm data is dense.
WITH lagged AS (
    SELECT
        product_id,
        monthly_sales,
        LAG(monthly_sales, 1) OVER w AS prev_month_sales,
        LAG(monthly_sales, 2) OVER w AS prev_2_month_sales
    FROM monthly_sales
    WINDOW w AS (PARTITION BY product_id ORDER BY sale_month)
)
SELECT DISTINCT product_id
FROM lagged
WHERE monthly_sales < prev_month_sales
  AND prev_month_sales < prev_2_month_sales;


13.⁠ ⁠Get users with at least 3 logins per week over last 2 months.

-- Users who logged in at least 3 times in EVERY week of the last two
-- months: a user qualifies when their count of >=3-login weeks equals
-- the total number of weeks observed in the window.
> WITH login_data AS (
-- One row per login, tagged with its week bucket.
SELECT
user_id,
DATE_TRUNC('week', login_time) AS week_start
FROM
user_logins
WHERE
login_time >= CURRENT_DATE - INTERVAL '2 months'
),
weekly_login_counts AS (
-- Number of logins per user per week.
SELECT
user_id,
week_start,
COUNT(*) AS weekly_logins
FROM
login_data
GROUP BY
user_id, week_start
),
active_weeks AS (
-- Per user: how many weeks met the >= 3 logins bar.
SELECT
user_id,
COUNT(*) AS weeks_with_3_or_more_logins
FROM
weekly_login_counts
WHERE
weekly_logins >= 3
GROUP BY
user_id
),
weeks_total AS (
-- Distinct weeks (across ALL users) that had any login in the window.
-- NOTE(review): a week with zero logins by anyone shrinks this total;
-- for strict calendar coverage, generate the weeks instead — confirm.
SELECT
COUNT(DISTINCT DATE_TRUNC('week', login_time)) AS total_weeks
FROM
user_logins
WHERE
login_time >= CURRENT_DATE - INTERVAL '2 months'
)
SELECT
a.user_id
FROM
active_weeks a
CROSS JOIN
weeks_total w
WHERE
-- Keep users whose qualifying weeks cover every observed week.
a.weeks_with_3_or_more_logins = w.total_weeks;


14.⁠ ⁠Rank users by frequency of login in the current quarter.

-- Rank users by login count within the current calendar quarter.
-- The three CTEs collapse into a single GROUP BY with a window over the
-- aggregate: COUNT(*) is computed per user first, then RANK is applied.
-- Ties share a rank (RANK leaves gaps after tied groups).
SELECT
    user_id,
    COUNT(*) AS login_count,
    RANK() OVER (ORDER BY COUNT(*) DESC) AS login_rank
FROM user_logins
WHERE DATE_TRUNC('quarter', login_time) = DATE_TRUNC('quarter', CURRENT_DATE)
GROUP BY user_id
ORDER BY login_rank;


15.⁠ ⁠Fetch users who purchased same product multiple times in one day.

-- Users who bought the same product two or more times on the same
-- calendar day, with the repeat count.
SELECT
    user_id,
    product_id,
    DATE(purchase_time) AS purchase_date,
    COUNT(*) AS purchase_count
FROM purchases
GROUP BY
    user_id,
    product_id,
    DATE(purchase_time)
HAVING COUNT(*) >= 2
ORDER BY
    user_id,
    purchase_date;


16.⁠ ⁠Detect and delete late-arriving data for current month partitions.

-- Step 1: detect late-arriving rows — rows sitting in the current
-- month's partition whose event_time actually belongs to another month.
-- Always run this SELECT and review the result before deleting.
> SELECT *
FROM events
WHERE
partition_month = DATE_TRUNC('month', CURRENT_DATE)
AND DATE_TRUNC('month', event_time) <> partition_month;

-- Step 2: delete exactly the rows identified above (same predicate).
-- NOTE(review): wrap in an explicit transaction where the engine
-- supports it, so the delete can be rolled back if the count surprises.
>DELETE FROM events
WHERE
partition_month = DATE_TRUNC('month', CURRENT_DATE)
AND DATE_TRUNC('month', event_time) <> partition_month;

17.⁠ ⁠Get top 5 products by profit margin across all categories.

> -- Top 5 products by profit margin % across all categories.
-- selling_price > 0 guards the division (and also drops NULL prices,
-- since NULL > 0 is UNKNOWN).
WITH margins AS (
    SELECT
        product_id,
        category_id,
        selling_price,
        cost_price,
        ROUND((selling_price - cost_price) / selling_price * 100, 2) AS profit_margin_percent
    FROM products
    WHERE selling_price > 0
)
SELECT *
FROM margins
ORDER BY profit_margin_percent DESC
LIMIT 5;

18.⁠ ⁠Compare rolling 30-day revenue vs previous 30-day window.

> -- Rolling 30-day revenue vs the immediately preceding, NON-overlapping
-- 30-day window.
-- Fixes two defects in the original: (1) the window summed a
-- misspelled column "daily_rerevenue", so the query did not run;
-- (2) LAG(..., 1) compared windows sharing 29 of their 30 days —
-- LAG(..., 30) reaches the window that ended 30 days earlier.
-- NOTE(review): the ROWS frame counts 30 *rows*, which equals 30 days
-- only if every calendar day has at least one sale — densify with a
-- calendar table first if days can be missing; confirm against data.
WITH daily_revenue AS (
    -- Revenue per calendar day.
    SELECT
        sale_date::date AS day,
        SUM(revenue) AS daily_revenue
    FROM sales
    GROUP BY sale_date::date
),
rolling_30 AS (
    -- Current day plus the 29 preceding days.
    SELECT
        day,
        SUM(daily_revenue) OVER (
            ORDER BY day
            ROWS BETWEEN 29 PRECEDING AND CURRENT ROW
        ) AS rolling_30_day_revenue
    FROM daily_revenue
),
comparison AS (
    -- Pair each window with the previous non-overlapping 30-day window.
    SELECT
        day,
        rolling_30_day_revenue,
        LAG(rolling_30_day_revenue, 30) OVER (ORDER BY day) AS previous_30_day_revenue
    FROM rolling_30
)
SELECT
    day,
    rolling_30_day_revenue,
    previous_30_day_revenue,
    ROUND(
        (rolling_30_day_revenue - previous_30_day_revenue)
        / NULLIF(previous_30_day_revenue, 0) * 100,
        2
    ) AS revenue_change_percent
FROM comparison
WHERE previous_30_day_revenue IS NOT NULL
ORDER BY day;

19.⁠ ⁠Flag transactions happening outside business hours.

>-- Flag transactions whose time-of-day falls outside 09:00–18:00.
>-- The boundary instants themselves count as within business hours, and
>-- NOT BETWEEN keeps the original's NULL behavior (a NULL time falls
>-- through to the ELSE branch).
SELECT
    transaction_id,
    transaction_time,
    CASE
        WHEN CAST(transaction_time AS TIME)
             NOT BETWEEN TIME '09:00:00' AND TIME '18:00:00'
            THEN 'Outside Business Hours'
        ELSE 'Within Business Hours'
    END AS business_hours_flag
FROM transactions;

20.⁠ ⁠Write an optimized SQL query using broadcast join hints for small lookup tables.

-- Revenue per order joined to a small region lookup.
-- The /*+ BROADCAST(lkp) */ hint is Spark/Databricks SQL syntax: it
-- ships the small lookup table to every executor so the large fact
-- table is joined without a shuffle — appropriate only when the lookup
-- side comfortably fits in executor memory.
> SELECT /*+ BROADCAST(lkp) */ 
f.order_id,
f.customer_id,
lkp.region,
SUM(f.amount) AS revenue
FROM fact_orders f
JOIN small_customer_lookup lkp
ON f.customer_id = lkp.customer_id
WHERE f.order_date BETWEEN '2025-01-01' AND '2025-06-30'
GROUP BY f.order_id, f.customer_id, lkp.region;

No comments:

Post a Comment