Data Cleaning Project

Streamlining global layoff data using MySQL 8.0+ (CTEs, window functions, regex, generated columns, constraints).

SQL Code


/* ===============================
   ADVANCED DATA CLEANING (MySQL 8+)
   =============================== */

/* 0) Create staging and add a surrogate key for safe de-dup */
DROP TABLE IF EXISTS layoffs_staging;
CREATE TABLE layoffs_staging LIKE layoffs;
INSERT INTO layoffs_staging SELECT * FROM layoffs;

ALTER TABLE layoffs_staging
  ADD COLUMN _row_id BIGINT NOT NULL AUTO_INCREMENT PRIMARY KEY;

/* 1) Canonical text cleanup using REGEXP_REPLACE + TRIM */
UPDATE layoffs_staging
SET
  company = REGEXP_REPLACE(TRIM(company), '\\s+', ' '),
  location = REGEXP_REPLACE(TRIM(location), '\\s+', ' '),
  industry = NULLIF(TRIM(industry), ''),
  stage = NULLIF(TRIM(stage), ''),
  country = REGEXP_REPLACE(TRIM(country), '\\.$', '');  -- drop trailing '.' in "United States."

/* 2) Robust date typing (convert only valid M/D/Y; keep other formats unchanged for review) */
UPDATE layoffs_staging
SET `date` = STR_TO_DATE(`date`, '%m/%d/%Y')
WHERE `date` REGEXP '^[0-1]?[0-9]/[0-3]?[0-9]/[1-2][0-9]{3}$';

ALTER TABLE layoffs_staging MODIFY COLUMN `date` DATE;

/* 3) Generated column: numeric percent from text like '25%' or '0.25' */
ALTER TABLE layoffs_staging
  ADD COLUMN pct_laid_off DECIMAL(6,4)
    GENERATED ALWAYS AS (
      CASE
        WHEN percentage_laid_off IS NULL OR percentage_laid_off = '' THEN NULL
        WHEN percentage_laid_off LIKE '%\\%%' THEN CAST(REPLACE(percentage_laid_off, '%', '') AS DECIMAL(6,3)) / 100
        ELSE CAST(percentage_laid_off AS DECIMAL(6,3))
      END
    ) VIRTUAL;

/* 4) Enforce domain constraints (MySQL 8.0.16+ enforces CHECK) */
ALTER TABLE layoffs_staging
  ADD CONSTRAINT chk_pct_range CHECK (pct_laid_off IS NULL OR (pct_laid_off BETWEEN 0 AND 1));

/* 5) Canonicalize industries via patterns (regex mapping) */
WITH industry_map AS (
  SELECT 'crypto|block.?chain|web ?3' AS rx, 'Crypto' AS canon UNION ALL
  SELECT 'travel|touris(m|t)|hospitality',       'Travel' UNION ALL
  SELECT 'fintech|financial tech|payments',      'Fintech' UNION ALL
  SELECT 'ai|machine learning|deep learning',    'AI/ML'  UNION ALL
  SELECT 'health|med(ical)? tech|pharma',        'Healthcare' UNION ALL
  SELECT 'e-?commerce|retail',                   'E-commerce'
)
UPDATE layoffs_staging t
JOIN industry_map m
  ON t.industry IS NOT NULL AND LOWER(t.industry) REGEXP m.rx
SET t.industry = m.canon;

/* 6) Impute missing industry with company "mode" (most frequent) */
UPDATE layoffs_staging t
JOIN (
  SELECT company,
         SUBSTRING_INDEX(GROUP_CONCAT(industry ORDER BY cnt DESC SEPARATOR ','), ',', 1) AS mode_industry
  FROM (
    SELECT company, industry, COUNT(*) AS cnt
    FROM layoffs_staging
    WHERE industry IS NOT NULL
    GROUP BY company, industry
  ) s
  GROUP BY company
) m ON t.company = m.company
SET t.industry = m.mode_industry
WHERE (t.industry IS NULL);

/* 7) De-duplicate using window functions + joined delete */
WITH ranked AS (
  SELECT _row_id,
         ROW_NUMBER() OVER (
           PARTITION BY company, location, industry, total_laid_off, pct_laid_off, `date`, stage, country, funds_raised_millions
           ORDER BY _row_id
         ) AS rn
  FROM layoffs_staging
)
DELETE t
FROM layoffs_staging t
JOIN ranked r ON r._row_id = t._row_id
WHERE r.rn > 1;

/* 8) Prevent future dupes (natural-ish unique index, using prefixes for TEXT cols) */
DROP INDEX IF EXISTS ux_layoffs_natural ON layoffs_staging;
CREATE UNIQUE INDEX ux_layoffs_natural
ON layoffs_staging (company(64), location(64), industry(64), total_laid_off, pct_laid_off, `date`, stage(64), country(64), funds_raised_millions);

/* 9) Remove rows with no signal (both total and pct null) */
DELETE FROM layoffs_staging
WHERE total_laid_off IS NULL AND pct_laid_off IS NULL;

/* 10) Optional: promote to a cleaned table (idempotent CTAS) */
DROP TABLE IF EXISTS layoffs_cleaned;
CREATE TABLE layoffs_cleaned AS
SELECT
  company, location, industry, total_laid_off, pct_laid_off,
  `date`, stage, country, funds_raised_millions
FROM layoffs_staging;

-- Useful indexes for analysis
ALTER TABLE layoffs_cleaned
  ADD INDEX ix_dt (`date`),
  ADD INDEX ix_company (company(64)),
  ADD INDEX ix_industry (industry(64), `date`);
          

Exploratory Data Analysis Project

Advanced analytics (recursive CTE calendar, rolling windows, YoY deltas, top-N per group, ROLLUP, anomalies).

SQL Code


/* ===============================
   ADVANCED EDA (MySQL 8+)
   =============================== */

-- 1) Complete month calendar and monthly totals (handles gaps)
WITH bounds AS (
  SELECT DATE_FORMAT(MIN(`date`), '%Y-%m-01') AS start_mon,
         DATE_FORMAT(MAX(`date`), '%Y-%m-01') AS end_mon
  FROM layoffs_cleaned
),
RECURSIVE months(mon) AS (
  SELECT start_mon FROM bounds
  UNION ALL
  SELECT DATE_ADD(mon, INTERVAL 1 MONTH) FROM months m, bounds b WHERE mon < b.end_mon
),
monthly AS (
  SELECT m.mon AS month_start,
         COALESCE(SUM(c.total_laid_off), 0) AS total_off
  FROM months m
  LEFT JOIN layoffs_cleaned c
         ON DATE_FORMAT(c.`date`, '%Y-%m-01') = m.mon
  GROUP BY m.mon
)
SELECT
  month_start,
  total_off,
  -- rolling 3/6/12 months
  SUM(total_off) OVER (ORDER BY month_start ROWS BETWEEN 2 PRECEDING AND CURRENT ROW) AS roll_3m,
  SUM(total_off) OVER (ORDER BY month_start ROWS BETWEEN 5 PRECEDING AND CURRENT ROW) AS roll_6m,
  SUM(total_off) OVER (ORDER BY month_start ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS roll_12m,
  -- YoY deltas
  total_off - LAG(total_off, 12) OVER (ORDER BY month_start) AS yoy_abs,
  CASE
    WHEN LAG(total_off, 12) OVER (ORDER BY month_start) IS NULL OR LAG(total_off, 12) OVER (ORDER BY month_start) = 0
      THEN NULL
    ELSE (total_off / LAG(total_off, 12) OVER (ORDER BY month_start)) - 1
  END AS yoy_pct
FROM monthly
ORDER BY month_start;

-- 2) Industry monthly trends with shares and anomaly z-scores
WITH by_industry AS (
  SELECT
    DATE_FORMAT(`date`, '%Y-%m-01') AS month_start,
    industry,
    SUM(total_laid_off) AS total_off
  FROM layoffs_cleaned
  GROUP BY DATE_FORMAT(`date`, '%Y-%m-01'), industry
),
with_totals AS (
  SELECT
    bi.*,
    SUM(total_off) OVER (PARTITION BY month_start) AS month_total
  FROM by_industry bi
),
scored AS (
  SELECT
    month_start, industry, total_off, month_total,
    total_off / NULLIF(month_total, 0) AS month_share,
    AVG(total_off)  OVER (PARTITION BY industry) AS mean_off,
    STDDEV_SAMP(total_off) OVER (PARTITION BY industry) AS sd_off
  FROM with_totals
)
SELECT
  month_start, industry, total_off, month_share,
  CASE WHEN sd_off IS NULL OR sd_off = 0 THEN 0 ELSE (total_off - mean_off)/sd_off END AS z_score
FROM scored
ORDER BY month_start, industry;

-- 3) Top-5 companies by year (per industry), with rank and contribution
WITH company_year AS (
  SELECT
    industry,
    company,
    YEAR(`date`) AS yr,
    SUM(total_laid_off) AS total_off
  FROM layoffs_cleaned
  WHERE `date` IS NOT NULL
  GROUP BY industry, company, YEAR(`date`)
),
ranked AS (
  SELECT *,
         DENSE_RANK() OVER (PARTITION BY industry, yr ORDER BY total_off DESC) AS rnk,
         SUM(total_off) OVER (PARTITION BY industry, yr) AS yr_total
  FROM company_year
)
SELECT
  industry, yr, rnk, company, total_off,
  total_off / NULLIF(yr_total, 0) AS pct_of_year_industry
FROM ranked
WHERE rnk <= 5
ORDER BY industry, yr, rnk, total_off DESC;

-- 4) Country and stage rollups (group totals + grand totals)
SELECT
  country,
  stage,
  SUM(total_laid_off) AS total_off
FROM layoffs_cleaned
GROUP BY country, stage WITH ROLLUP
ORDER BY country IS NULL, country, stage IS NULL, stage;

-- 5) Cohort: first-layoff cohort and recurrence intervals
WITH events AS (
  SELECT
    company,
    `date`,
    total_laid_off,
    MIN(`date`) OVER (PARTITION BY company) AS cohort_start,
    LAG(`date`) OVER (PARTITION BY company ORDER BY `date`) AS prev_date
  FROM layoffs_cleaned
  WHERE `date` IS NOT NULL
),
intervals AS (
  SELECT
    company,
    cohort_start,
    `date`,
    DATEDIFF(`date`, cohort_start) AS days_since_first,
    DATEDIFF(`date`, prev_date)   AS days_since_prev,
    total_laid_off
  FROM events
)
SELECT *
FROM intervals
ORDER BY company, `date`;

-- 6) Pivot-style year x industry matrix (conditional aggregation)
SELECT
  YEAR(`date`) AS yr,
  SUM(CASE WHEN industry = 'AI/ML'      THEN total_laid_off ELSE 0 END) AS ai_ml,
  SUM(CASE WHEN industry = 'Crypto'     THEN total_laid_off ELSE 0 END) AS crypto,
  SUM(CASE WHEN industry = 'E-commerce' THEN total_laid_off ELSE 0 END) AS ecom,
  SUM(CASE WHEN industry = 'Fintech'    THEN total_laid_off ELSE 0 END) AS fintech,
  SUM(CASE WHEN industry = 'Healthcare' THEN total_laid_off ELSE 0 END) AS healthcare,
  SUM(CASE WHEN industry = 'Travel'     THEN total_laid_off ELSE 0 END) AS travel
FROM layoffs_cleaned
GROUP BY YEAR(`date`)
ORDER BY yr DESC;

-- 7) Running share of a company within its industry over time (TTM)
WITH company_month AS (
  SELECT
    DATE_FORMAT(`date`, '%Y-%m-01') AS mon,
    industry,
    company,
    SUM(total_laid_off) AS off_m
  FROM layoffs_cleaned
  GROUP BY DATE_FORMAT(`date`, '%Y-%m-01'), industry, company
),
ttm AS (
  SELECT
    mon, industry, company, off_m,
    SUM(off_m) OVER (PARTITION BY industry, company ORDER BY mon ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS ttm_company,
    SUM(off_m) OVER (PARTITION BY industry ORDER BY mon ROWS BETWEEN 11 PRECEDING AND CURRENT ROW) AS ttm_industry
  FROM company_month
)
SELECT
  mon, industry, company,
  ttm_company, ttm_industry,
  ttm_company / NULLIF(ttm_industry, 0) AS ttm_share
FROM ttm
ORDER BY mon, industry, ttm_share DESC;