Streamlining global layoff data using MySQL 8.0+ (CTEs, window functions, regex, generated columns, constraints).
SQL Code
/* ===============================
ADVANCED DATA CLEANING (MySQL 8+)
=============================== */
/* 0) Create staging and add a surrogate key for safe de-dup */
DROP TABLE IF EXISTS layoffs_staging;
CREATE TABLE layoffs_staging LIKE layoffs;
INSERT INTO layoffs_staging SELECT * FROM layoffs;
ALTER TABLE layoffs_staging
ADD COLUMN _row_id BIGINT NOT NULL AUTO_INCREMENT PRIMARY KEY;
/* 1) Canonical text cleanup using REGEXP_REPLACE + TRIM */
UPDATE layoffs_staging
SET
company = REGEXP_REPLACE(TRIM(company), '\\s+', ' '),
location = REGEXP_REPLACE(TRIM(location), '\\s+', ' '),
industry = NULLIF(TRIM(industry), ''),
stage = NULLIF(TRIM(stage), ''),
country = REGEXP_REPLACE(TRIM(country), '\\.$', ''); -- drop trailing '.' in "United States."
/* 2) Robust date typing (convert only valid M/D/Y; keep other formats unchanged for review) */
UPDATE layoffs_staging
SET `date` = STR_TO_DATE(`date`, '%m/%d/%Y')
WHERE `date` REGEXP '^[0-1]?[0-9]/[0-3]?[0-9]/[1-2][0-9]{3}$';
ALTER TABLE layoffs_staging MODIFY COLUMN `date` DATE;
/* 3) Generated column: numeric percent from text like '25%' or '0.25' */
ALTER TABLE layoffs_staging
ADD COLUMN pct_laid_off DECIMAL(6,4)
GENERATED ALWAYS AS (
CASE
WHEN percentage_laid_off IS NULL OR percentage_laid_off = '' THEN NULL
WHEN percentage_laid_off LIKE '%\\%%' THEN CAST(REPLACE(percentage_laid_off, '%', '') AS DECIMAL(6,3)) / 100
ELSE CAST(percentage_laid_off AS DECIMAL(6,3))
END
) VIRTUAL;
/* 4) Enforce domain constraints (MySQL 8.0.16+ enforces CHECK) */
ALTER TABLE layoffs_staging
ADD CONSTRAINT chk_pct_range CHECK (pct_laid_off IS NULL OR (pct_laid_off BETWEEN 0 AND 1));
/* 5) Canonicalize industries via patterns (regex mapping) */
WITH industry_map AS (
SELECT 'crypto|block.?chain|web ?3' AS rx, 'Crypto' AS canon UNION ALL
SELECT 'travel|touris(m|t)|hospitality', 'Travel' UNION ALL
SELECT 'fintech|financial tech|payments', 'Fintech' UNION ALL
SELECT 'ai|machine learning|deep learning', 'AI/ML' UNION ALL
SELECT 'health|med(ical)? tech|pharma', 'Healthcare' UNION ALL
SELECT 'e-?commerce|retail', 'E-commerce'
)
UPDATE layoffs_staging t
JOIN industry_map m
ON t.industry IS NOT NULL AND LOWER(t.industry) REGEXP m.rx
SET t.industry = m.canon;
/* 6) Impute missing industry with company "mode" (most frequent) */
UPDATE layoffs_staging t
JOIN (
SELECT company,
SUBSTRING_INDEX(GROUP_CONCAT(industry ORDER BY cnt DESC SEPARATOR ','), ',', 1) AS mode_industry
FROM (
SELECT company, industry, COUNT(*) AS cnt
FROM layoffs_staging
WHERE industry IS NOT NULL
GROUP BY company, industry
) s
GROUP BY company
) m ON t.company = m.company
SET t.industry = m.mode_industry
WHERE (t.industry IS NULL);
/* 7) De-duplicate using window functions + joined delete */
WITH ranked AS (
SELECT _row_id,
ROW_NUMBER() OVER (
PARTITION BY company, location, industry, total_laid_off, pct_laid_off, `date`, stage, country, funds_raised_millions
ORDER BY _row_id
) AS rn
FROM layoffs_staging
)
DELETE t
FROM layoffs_staging t
JOIN ranked r ON r._row_id = t._row_id
WHERE r.rn > 1;
/* 8) Prevent future dupes (natural-ish unique index, using prefixes for TEXT cols) */
DROP INDEX IF EXISTS ux_layoffs_natural ON layoffs_staging;
CREATE UNIQUE INDEX ux_layoffs_natural
ON layoffs_staging (company(64), location(64), industry(64), total_laid_off, pct_laid_off, `date`, stage(64), country(64), funds_raised_millions);
/* 9) Remove rows with no signal (both total and pct null) */
DELETE FROM layoffs_staging
WHERE total_laid_off IS NULL AND pct_laid_off IS NULL;
/* 10) Optional: promote to a cleaned table (idempotent CTAS) */
DROP TABLE IF EXISTS layoffs_cleaned;
CREATE TABLE layoffs_cleaned AS
SELECT
company, location, industry, total_laid_off, pct_laid_off,
`date`, stage, country, funds_raised_millions
FROM layoffs_staging;
-- Useful indexes for analysis
ALTER TABLE layoffs_cleaned
ADD INDEX ix_dt (`date`),
ADD INDEX ix_company (company(64)),
ADD INDEX ix_industry (industry(64), `date`);
Advanced analytics (recursive CTE calendar, rolling windows, YoY deltas, top-N per group, ROLLUP, anomalies).
SQL Code
/* ===============================
ADVANCED EDA (MySQL 8+)
=============================== */
-- 1) Complete month calendar and monthly totals (handles gaps)
WITH bounds AS (
SELECT DATE_FORMAT(MIN(`date`), '%Y-%m-01') AS start_mon,
DATE_FORMAT(MAX(`date`), '%Y-%m-01') AS end_mon
FROM layoffs_cleaned
),
RECURSIVE months(mon) AS (
SELECT start_mon FROM bounds
UNION ALL
SELECT DATE_ADD(mon, INTERVAL 1 MONTH) FROM months m, bounds b WHERE mon < b.end_mon
),
monthly AS (
SELECT m.mon AS month_start,
COALESCE(SUM(c.total_laid_off), 0) AS total_off
FROM months m
LEFT JOIN layoffs_cleaned c
ON DATE_FORMAT(c.`date`, '%Y-%m-01') = m.mon
GROUP BY m.mon
)
SELECT
month_start,
total_off,
-- rolling 3/6/12 months
SUM(total_off) OVER (ORDER BY month_start ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS roll_3m,
SUM(total_off) OVER (ORDER BY month_start ROWS BETWEEN 5 PRECEDING AND CURRENT ROW) AS roll_6m,
SUM(total_off) OVER (ORDER BY month_start ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS roll_12m,
-- YoY deltas
total_off - LAG(total_off, 12) OVER (ORDER BY month_start) AS yoy_abs,
CASE
WHEN LAG(total_off, 12) OVER (ORDER BY month_start) IS NULL OR LAG(total_off, 12) OVER (ORDER BY month_start) = 0
THEN NULL
ELSE (total_off / LAG(total_off, 12) OVER (ORDER BY month_start)) - 1
END AS yoy_pct
FROM monthly
ORDER BY month_start;
-- 2) Industry monthly trends with shares and anomaly z-scores
WITH by_industry AS (
SELECT
DATE_FORMAT(`date`, '%Y-%m-01') AS month_start,
industry,
SUM(total_laid_off) AS total_off
FROM layoffs_cleaned
GROUP BY DATE_FORMAT(`date`, '%Y-%m-01'), industry
),
with_totals AS (
SELECT
bi.*,
SUM(total_off) OVER (PARTITION BY month_start) AS month_total
FROM by_industry bi
),
scored AS (
SELECT
month_start, industry, total_off, month_total,
total_off / NULLIF(month_total, 0) AS month_share,
AVG(total_off) OVER (PARTITION BY industry) AS mean_off,
STDDEV_SAMP(total_off) OVER (PARTITION BY industry) AS sd_off
FROM with_totals
)
SELECT
month_start, industry, total_off, month_share,
CASE WHEN sd_off IS NULL OR sd_off = 0 THEN 0 ELSE (total_off - mean_off)/sd_off END AS z_score
FROM scored
ORDER BY month_start, industry;
-- 3) Top-5 companies by year (per industry), with rank and contribution
WITH company_year AS (
SELECT
industry,
company,
YEAR(`date`) AS yr,
SUM(total_laid_off) AS total_off
FROM layoffs_cleaned
WHERE `date` IS NOT NULL
GROUP BY industry, company, YEAR(`date`)
),
ranked AS (
SELECT *,
DENSE_RANK() OVER (PARTITION BY industry, yr ORDER BY total_off DESC) AS rnk,
SUM(total_off) OVER (PARTITION BY industry, yr) AS yr_total
FROM company_year
)
SELECT
industry, yr, rnk, company, total_off,
total_off / NULLIF(yr_total, 0) AS pct_of_year_industry
FROM ranked
WHERE rnk <= 5
ORDER BY industry, yr, rnk, total_off DESC;
-- 4) Country and stage rollups (group totals + grand totals)
SELECT
country,
stage,
SUM(total_laid_off) AS total_off
FROM layoffs_cleaned
GROUP BY country, stage WITH ROLLUP
ORDER BY country IS NULL, country, stage IS NULL, stage;
-- 5) Cohort: first-layoff cohort and recurrence intervals
WITH events AS (
SELECT
company,
`date`,
total_laid_off,
MIN(`date`) OVER (PARTITION BY company) AS cohort_start,
LAG(`date`) OVER (PARTITION BY company ORDER BY `date`) AS prev_date
FROM layoffs_cleaned
WHERE `date` IS NOT NULL
),
intervals AS (
SELECT
company,
cohort_start,
`date`,
DATEDIFF(`date`, cohort_start) AS days_since_first,
DATEDIFF(`date`, prev_date) AS days_since_prev,
total_laid_off
FROM events
)
SELECT *
FROM intervals
ORDER BY company, `date`;
-- 6) Pivot-style year x industry matrix (conditional aggregation)
SELECT
YEAR(`date`) AS yr,
SUM(CASE WHEN industry = 'AI/ML' THEN total_laid_off ELSE 0 END) AS ai_ml,
SUM(CASE WHEN industry = 'Crypto' THEN total_laid_off ELSE 0 END) AS crypto,
SUM(CASE WHEN industry = 'E-commerce' THEN total_laid_off ELSE 0 END) AS ecom,
SUM(CASE WHEN industry = 'Fintech' THEN total_laid_off ELSE 0 END) AS fintech,
SUM(CASE WHEN industry = 'Healthcare' THEN total_laid_off ELSE 0 END) AS healthcare,
SUM(CASE WHEN industry = 'Travel' THEN total_laid_off ELSE 0 END) AS travel
FROM layoffs_cleaned
GROUP BY YEAR(`date`)
ORDER BY yr DESC;
-- 7) Running share of a company within its industry over time (TTM)
WITH company_month AS (
SELECT
DATE_FORMAT(`date`, '%Y-%m-01') AS mon,
industry,
company,
SUM(total_laid_off) AS off_m
FROM layoffs_cleaned
GROUP BY DATE_FORMAT(`date`, '%Y-%m-01'), industry, company
),
ttm AS (
SELECT
mon, industry, company, off_m,
SUM(off_m) OVER (PARTITION BY industry, company ORDER BY mon ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS ttm_company,
SUM(off_m) OVER (PARTITION BY industry ORDER BY mon ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS ttm_industry
FROM company_month
)
SELECT
mon, industry, company,
ttm_company, ttm_industry,
ttm_company / NULLIF(ttm_industry, 0) AS ttm_share
FROM ttm
ORDER BY mon, industry, ttm_share DESC;