// Doc Parser — upload payslip or offer letter → auto-fill calculator
// Preview only (?preview=1). Fully client-side: PDF.js text extraction + pattern matching.
// Nothing is uploaded or stored.
const { useState: dpUseState, useCallback: dpUseCallback, useRef: dpUseRef } = React;

// ── Field definitions — label patterns matched against extracted PDF text ──
// Each entry maps one calculator field to the label spellings that identify it.
//   field       key used in the extraction result
//   label       human-readable name for the field
//   monthly     true → value is a per-month amount (annual docs are ÷12 by the extractor)
//   isAnnual    true → value is stored as an annual amount
//   isDeduction true → excluded when the extractor sums earnings components
//   accumulate  true → several matching labels are summed into the same field
//   patterns    lower-case label spellings; patterns[0] yields "high" confidence
//
// How patterns are matched (see extractSalaryFields): each line is lower-cased, the
// characters : - – | are replaced by spaces, whitespace is collapsed and the line is
// trimmed; a pattern then matches only when it is bounded by non-alphanumerics (or the
// line edge). Consequences for anything added here:
//   • a pattern containing "-" can never match — always include the space-separated
//     spelling ("sign on bonus" next to "sign-on bonus");
//   • do not pad short codes with spaces (" da ") — the boundary check already makes the
//     bare word safe, and the padded form misses ordinary rows such as "DA 5,000".
const FIELD_PATTERNS = [
  {
    field: "basic", label: "Basic Salary", monthly: true,
    // "annual basic salary" = actual basic component (Annexure tables).
    // "annual base salary" / "base salary" intentionally excluded — in MNC offers those
    // refer to the TOTAL fixed package (basic + allowances), handled in META grossPay.
    // "base pay" kept as it often means the actual basic in standalone context.
    patterns: ["annual basic salary", "basic salary", "basic pay", "basic wages", "basic compensation", "basic sal", "bsc sal", "fixed basic", "fixed pay", "base pay", "base compensation", "basic"],
  },
  {
    field: "hra", label: "HRA", monthly: true,
    patterns: ["house rent allowance", "house rent allow", "house rent", "hra allowance", "h.r.a.", "hra"],
  },
  {
    field: "da", label: "Dearness Allowance (DA)", monthly: true,
    // Bare "da" relies on the word-boundary check, same as "lta" / "hra".
    patterns: ["dearness allowance", "dearness pay", "dear allow", "da"],
  },
  {
    field: "lta", label: "LTA", monthly: true,
    // "lta" bare pattern relies on word-boundary check — safe even at line start
    // (handles Citi/payslip tables where the row is just "LTA   70,000").
    patterns: ["leave travel allowance", "leave travel assistance", "leave travel concession", "leave travel assist", "lta allowance", "ltc allowance", "lta", "ltc"],
  },
  {
    field: "conveyance", label: "Conveyance", monthly: true,
    patterns: ["conveyance allowance", "conveyance allow", "transport allowance", "transportation allowance", "travelling allowance", "travel allowance", "city allowance", "conveyance"],
  },
  {
    field: "medical", label: "Medical Allowance", monthly: true,
    patterns: ["medical allowance", "medical reimbursement", "medical allow", "medical exp", "medical benefit", "medical"],
  },
  {
    field: "telephone", label: "Telephone / Mobile", monthly: true,
    patterns: ["telephone allowance", "mobile allowance", "communication allowance", "internet allowance", "mobile reimb", "telephone reimb", "telephone", "mobile allow"],
  },
  {
    field: "special", label: "Special Allowance", monthly: true,
    // accumulate: true — when multiple lines match (e.g. "Special Allowance 1,700" on one
    // page and "Personal Allowance 6,680" on another), add the values rather than keeping
    // only the first. Both roll up to the single "special" calculator field.
    accumulate: true,
    patterns: ["special allowance", "special pay", "special comp", "personal allowance", "lpa amount", "location premium allowance", "flexi benefit plan", "flexi benefit", "flexible benefit plan", "flexible benefit", "flexi pay", "flexible pay", "other allowance", "miscellaneous allowance", "flexible allowance", "flexi allowance", "performance allowance", "supplementary allowance", "additional allowance", "fixed allowance", "basket of allowances", "basket of allow", "boa", "bouquet of benefits", "bouquet of benefit", "exgratia", "ex-gratia", "ex gratia", "bonus / ex", "bonus/ex"],
  },
  {
    field: "perfPay", label: "Monthly Performance Pay", monthly: true,
    patterns: ["monthly performance pay", "monthly performance", "guaranteed monthly bonus", "monthly bonus"],
  },
  {
    // Joining / sign-on bonus shown with its own label in the review UI,
    // but applied to state.bonusAnnual (same tax treatment as annual bonus).
    field: "joiningBonus", label: "Joining / Sign-on Bonus", monthly: false, isAnnual: true,
    stateField: "bonusAnnual",
    // Hyphenated spellings are kept for reference; the space-separated ones are what
    // actually match after the line's "-" has been normalised to a space.
    patterns: ["joining bonus", "sign-on bonus", "sign on bonus", "signing bonus", "join bonus", "one-time bonus", "one time bonus", "onetime bonus"],
  },
  {
    field: "bonusAnnual", label: "Annual Bonus / Incentive", monthly: false, isAnnual: true,
    patterns: ["retention bonus", "performance pay", "performance incentive", "annual performance", "performance linked", "quarterly performance"],
  },
  {
    field: "variable", label: "Variable Pay (Annual)", monthly: false, isAnnual: true,
    patterns: ["variable pay", "target bonus", "annual bonus target", "bonus target", "quarterly variable allowance", "quarterly variable", "annual variable", "variable component"],
  },
  {
    field: "meal", label: "Meal / Food", monthly: true,
    patterns: ["food allowance", "meal allowance", "sodexo", "meal voucher", "food voucher", "food card", "canteen allowance", "food coupons", "food coupon", "lunch allowance", "meal reimb"],
  },
  {
    field: "car", label: "Car / Vehicle Allowance", monthly: true,
    patterns: ["car allowance", "vehicle allowance", "automobile allowance", "car perquisite", "car maintenance allowance", "car lease allowance"],
  },
  {
    field: "cea", label: "Children Education", monthly: true,
    patterns: ["children education allowance", "child education allowance", "children edu", "child edu allow", "cea"],
  },
  {
    field: "profTaxMonthly", label: "Professional Tax", monthly: true, isDeduction: true,
    patterns: ["professional tax", "profession tax", "p.tax", "pt deduction", "prof. tax", "prof tax", "state tax"],
  },
];

// Summary-level figures (gross / net). Used for document-type detection, net-pay display
// and the gross-based fallback estimates — never written to calculator state directly.
// Entries are evaluated in array order by the extractor.
const META_PATTERNS = [
  {
    // Explicit per-month gross: listed first and NEVER divided by 12.
    // Takes priority over grossPay (which may come from an annual CTC line ÷12).
    field: "monthlyGross",
    monthly: true,
    patterns: [
      "monthly gross salary",
      "gross monthly salary",
      "monthly gross pay",
      "gross salary per month",
      "monthly gross",
    ],
  },
  {
    field: "grossPay",
    monthly: false,
    patterns: [
      "annual base salary",
      "base salary",
      "on-target earnings",
      "on target earnings",
      "gross salary",
      "gross pay",
      "gross earnings",
      "total earnings",
      "gross ctc",
      "total gross",
      "total cash",
      "gross amount",
      "total emoluments",
      "gross bouquet of benefits",
      "gross bouquet",
    ],
  },
  {
    field: "netPay",
    monthly: false,
    patterns: [
      "net salary",
      "net pay",
      "take home",
      "in hand salary",
      "net amount payable",
      "net amount",
      "net payable",
      "amount payable",
      "net wages",
    ],
  },
];

// ── Text extraction using PDF.js ──────────────────────────────────────────

// Turns one PDF.js page into an array of text lines.
// Text items are bucketed into rows by their y coordinate (snapped to a 6pt grid so
// slightly misaligned table cells still share a row), rows are emitted top→bottom and
// the items inside a row left→right. For every horizontal jump of more than 80 units
// between neighbouring items, the two halves of the row are emitted as extra lines so
// a label column and a value column can also be matched on their own.
async function extractPageTextLines(page) {
  const { items: textItems } = await page.getTextContent();

  // rowKey (snapped y) → cells in that row
  const rows = new Map();
  for (const textItem of textItems) {
    if (textItem.str.trim() === "") continue;
    const rowKey = Math.round(textItem.transform[5] / 6) * 6;
    const cell = { x: textItem.transform[4], text: textItem.str };
    const row = rows.get(rowKey);
    if (row) {
      row.push(cell);
    } else {
      rows.set(rowKey, [cell]);
    }
  }

  // PDF coordinates grow upwards, so a larger y is higher on the page.
  const rowKeysTopDown = [...rows.keys()].sort((a, b) => b - a);

  const out = [];
  for (const rowKey of rowKeysTopDown) {
    const row = rows.get(rowKey).sort((left, right) => left.x - right.x);
    const texts = row.map((cell) => cell.text);
    out.push(texts.join(" "));

    // Emit both sides of every wide gap as standalone lines (two-column tables).
    for (let cut = 1; cut < row.length; cut++) {
      if (row[cut].x - row[cut - 1].x > 80) {
        out.push(texts.slice(0, cut).join(" "));
        out.push(texts.slice(cut).join(" "));
      }
    }
  }
  return out;
}

// ── OCR fallback for scanned / image-only PDFs ────────────────────────────
// Loads Tesseract.js lazily (only when needed) then renders each PDF page to
// an off-screen canvas and runs English OCR on it.
// onProgress(msg) is called with human-readable status strings.

// Shared in-flight load so concurrent callers inject the OCR script only once.
// Reset to null when a load fails, which lets a later call retry from scratch.
let dpTesseractLoading = null;

/**
 * Lazily loads Tesseract.js from the CDN and resolves with the `Tesseract` global.
 *
 * Returns immediately when `window.Tesseract` already exists. Otherwise a single
 * <script> tag is appended to <head>; every caller that arrives while that script is
 * still loading awaits the same promise instead of appending another tag.
 *
 * @returns {Promise<object>} the `window.Tesseract` namespace
 * @throws {Error} "Could not load OCR script" when the script fails to load, or
 *   "Tesseract global missing" when it loads but does not define the global
 */
async function loadTesseract() {
  if (window.Tesseract) return window.Tesseract;

  if (!dpTesseractLoading) {
    dpTesseractLoading = new Promise((resolve, reject) => {
      const s = document.createElement("script");
      // v5: auto-detects worker/core/lang paths from its own CDN location —
      // no explicit path config needed, avoids cross-origin worker issues.
      s.src = "https://unpkg.com/tesseract.js@5.0.5/dist/tesseract.min.js";
      s.onload  = () => window.Tesseract ? resolve(window.Tesseract) : reject(new Error("Tesseract global missing"));
      s.onerror = () => reject(new Error("Could not load OCR script"));
      document.head.appendChild(s);
    }).catch((err) => {
      // Forget the failed attempt so the next call can try again.
      dpTesseractLoading = null;
      throw err;
    });
  }
  return dpTesseractLoading;
}

// Rasterises one PDF.js page onto a fresh off-screen <canvas> and returns the canvas.
// `scale` is passed straight to pdfPage.getViewport (default 2.0); the canvas is sized
// to the viewport's rounded pixel dimensions before rendering starts.
async function renderPageToCanvas(pdfPage, scale = 2.0) {
  const viewport = pdfPage.getViewport({ scale });

  const target = document.createElement("canvas");
  target.width = Math.round(viewport.width);
  target.height = Math.round(viewport.height);

  const renderTask = pdfPage.render({
    canvasContext: target.getContext("2d"),
    viewport,
  });
  await renderTask.promise;

  return target;
}

/**
 * Runs English OCR over the first pages of a PDF and returns the recognised text lines.
 *
 * Each page is rendered to a canvas at 2× scale and passed to a Tesseract worker.
 * Recognised text is split on newlines, trimmed, and lines of one character or less
 * are dropped; a "---page---" marker is appended after every page.
 *
 * @param {object} pdf - PDF.js document (uses `numPages` and `getPage(n)`)
 * @param {number} maxPages - upper bound on the number of pages to OCR
 * @param {(msg: string) => void} onProgress - receives human-readable status strings
 * @returns {Promise<string[]>} recognised lines with page markers
 * @throws whatever the script load, worker creation, rendering or recognition throws
 */
async function ocrPDF(pdf, maxPages, onProgress) {
  const T = await loadTesseract();
  onProgress("Initialising OCR engine…");

  // v5 API: createWorker(lang, oem?, options?)
  // Paths are auto-resolved from the loaded script URL — no explicit config needed.
  const worker = await T.createWorker("eng", 1, {
    logger: m => {
      if (m.status === "recognizing text") {
        onProgress(`Recognising text… ${Math.round((m.progress || 0) * 100)}%`);
      }
    },
  });

  const lines = [];
  try {
    const pagesToOCR = Math.min(pdf.numPages, maxPages);
    for (let p = 1; p <= pagesToOCR; p++) {
      onProgress(`OCR: scanning page ${p} of ${pagesToOCR}…`);
      const canvas = await renderPageToCanvas(await pdf.getPage(p), 2.0);
      const { data: { text } } = await worker.recognize(canvas);
      const pageLines = text.split("\n").map(l => l.trim()).filter(l => l.length > 1);
      lines.push(...pageLines, "---page---");
    }
  } finally {
    // Always release the worker — a failed render/recognise must not leak it.
    await worker.terminate();
  }

  return lines;
}

// Entry point for text extraction.
// Pass 1 reads the embedded text layer of the first 10 pages with PDF.js. When that
// yields fewer than 150 non-whitespace characters in total, the file is treated as a
// scan and pass 2 re-reads the first 6 pages through Tesseract OCR instead.
// Returns the line array (each page followed by a "---page---" marker).
// Throws "PDF.js not loaded" when the library global is missing, and rewraps any OCR
// failure as "OCR_FAILED: <reason>" so the caller can show a specific message.
async function extractTextFromPDF(file, onProgress) {
  const pdfjs = window.pdfjsLib;
  if (!pdfjs) throw new Error("PDF.js not loaded");

  const bytes = new Uint8Array(await file.arrayBuffer());
  const pdf = await pdfjs.getDocument({ data: bytes }).promise;

  // ── Pass 1: native text layer (fast, works for digital PDFs) ──
  const textLayerLines = [];
  const lastTextPage = Math.min(pdf.numPages, 10);
  for (let pageNo = 1; pageNo <= lastTextPage; pageNo++) {
    const page = await pdf.getPage(pageNo);
    const pageLines = await extractPageTextLines(page);
    textLayerLines.push(...pageLines, "---page---");
  }

  // ── Pass 2: OCR fallback when the text layer is (almost) empty ──
  const visibleChars = textLayerLines
    .filter((line) => line && line !== "---page---")
    .join("")
    .replace(/\s/g, "")
    .length;
  if (visibleChars >= 150) return textLayerLines;

  onProgress("Scanned PDF detected — starting OCR (may take 30–60 s)…");
  try {
    return await ocrPDF(pdf, 6, onProgress);
  } catch (e) {
    // Prefix lets processFile distinguish OCR failures from other errors.
    throw new Error("OCR_FAILED: " + (e.message || e));
  }
}

// ── Two-column monthly/annual detector ───────────────────────────────────
// When a salary table row carries both a monthly and an annual figure, return the
// monthly one. Looks at every pair of numbers ≥ 100 on the line and accepts the first
// pair whose larger/smaller ratio lies in 10.5–13.6; the smaller number of that pair is
// the monthly value. Column order does not matter: "Monthly | Annual" and
// "Annual | Monthly" rows are both recognised.
// Returns null for empty input or when no such pair exists.
function extractTwoColMonthly(text) {
  if (!text) return null;
  const cleaned = cleanForNum(text);
  const nums = [...cleaned.matchAll(/\d[\d,]*(?:\.\d+)?/g)]
    .map(m => parseFloat(m[0].replace(/,/g, "")))
    // Ignore small numbers (percentages, serial numbers, day counts).
    .filter(n => n >= 100);
  for (let i = 0; i < nums.length; i++) {
    for (let j = i + 1; j < nums.length; j++) {
      const smaller = Math.min(nums[i], nums[j]);
      const larger  = Math.max(nums[i], nums[j]);
      const ratio = larger / smaller;
      // 10.5–13.6 catches 12× (standard) and the TCS TOTAL GROSS row (700022/51844 ≈ 13.50)
      if (ratio >= 10.5 && ratio <= 13.6) return smaller; // smaller = monthly
    }
  }
  return null;
}

// ── Number extractor — handles ₹1,23,456 · 12345 · 1234.56 ─────────────
// Strips currency decoration from `text` so plain digit runs are left for the number
// regexes: removes the characters ₹ ` $ € £ , %, every "/-" suffix and every
// "Rs" / "Rs." (case-insensitive, anywhere in the string), then trims the result.
function cleanForNum(text) {
  const withoutSymbols = text.replace(/[₹`$€£,%]/g, "");
  const withoutSlashDash = withoutSymbols.replace(/\/-/g, "");
  const withoutRupeeWord = withoutSlashDash.replace(/Rs\.?/gi, "");
  return withoutRupeeWord.trim();
}
// Pulls a single positive amount out of a line after currency clean-up.
// Two shapes are recognised, in this order:
//   1. the whole cleaned line is one number (an optional leading "-" is ignored);
//   2. the line ends in a number, optionally followed by /mo, /month, pm or p.m.
// Returns the parsed number, or null for empty input, no match, or a value ≤ 0.
function extractNumber(text) {
  if (!text) return null;
  const cleaned = cleanForNum(text);
  const positiveOrNull = (n) => (n > 0 ? n : null);

  // Whole line is a number
  const wholeMatch = /^-?(\d+(?:\.\d+)?)$/.exec(cleaned);
  if (wholeMatch) return positiveOrNull(parseFloat(wholeMatch[1]));

  // Number at the end of the line (most reliable for tables)
  const tailMatch = /(\d[\d,]*(?:\.\d+)?)\s*(?:\/mo|\/month|pm|p\.m\.)?$/i.exec(cleaned);
  if (!tailMatch) return null;
  return positiveOrNull(parseFloat(tailMatch[1].replace(/,/g, "")));
}
// Returns the first number that directly follows a currency marker, for amounts that
// sit mid-sentence (e.g. "will be ` 7,00,022/- per annum").
// Accepted markers (case-insensitive): ₹, backtick, Rs / Rs., INR, and € ¥ £ ¢ — the
// last four are included because OCR of low-resolution scans often misreads ₹ as one
// of them. Optional whitespace may separate marker and digits; commas are stripped.
// Returns null for empty input, no match, or a value that is not greater than 0.
function extractCurrencyNumber(text) {
  if (!text) return null;

  const hit = /(?:₹|`|Rs\.?|INR|[€¥£¢])\s*([\d,]+(?:\.\d+)?)/i.exec(text);
  if (hit === null) return null;

  const amount = parseFloat(hit[1].replace(/,/g, ""));
  if (amount > 0) return amount;
  return null;
}

// Last-resort extractor for lines where the currency symbol is missing or garbled.
// Recognises only comma-grouped amounts in these two shapes (optional decimals):
//   d,dd,ddd / dd,dd,ddd   — lakh grouping, e.g. 3,36,875
//   dd,ddd                 — five-digit thousands, e.g. 28,073
// Returns the first such number with commas removed, or null when the input is empty,
// nothing matches, or the parsed value is below 1000.
function extractIndianFormatNumber(text) {
  if (!text) return null;

  const grouped = /\b(\d{1,2},\d{2},\d{3}(?:\.\d+)?|\d{2},\d{3}(?:\.\d+)?)\b/.exec(text);
  if (grouped === null) return null;

  const amount = parseFloat(grouped[1].replace(/,/g, ""));
  if (amount >= 1000) return amount;
  return null;
}

// ── Main extraction engine ────────────────────────────────────────────────
/**
 * Scans extracted document lines for salary components and summary figures.
 *
 * Pipeline:
 *   1. Decide from the whole text whether amounts are annual (offer letter) or monthly.
 *   2. Walk the lines once; for each line try every FIELD_PATTERNS entry, then every
 *      META_PATTERNS entry. A label may take its number from the same line or from one
 *      of the next few lines (4 for components, 2 for meta figures).
 *   3. Post-process: remove Bouquet-of-Benefits double counting, then fill gaps
 *      (basic / special) from a gross figure when one is available.
 *
 * @param {string[]} lines - text lines; "---page---" entries are page separators
 * @returns {{found: Object, meta: Object, likelyAnnual: boolean, debugLines: string[]}}
 *   found: field → { value, confidence: "high"|"medium"|"low", rawLine };
 *   meta: monthlyGross / grossPay / netPay when detected, plus estimatedFromGross;
 *   debugLines: the input lines, returned unchanged.
 */
function extractSalaryFields(lines) {
  const found = {};  // field → { value, confidence, rawLine }
  const meta  = {};  // monthlyGross, grossPay, netPay, estimatedFromGross

  const fullText = lines.join(" ").toLowerCase();

  // Detect if values are likely annual (offer letter) or monthly (payslip)
  // Monthly signals take priority — if the doc explicitly says "per month" near figures, trust that
  const hasMonthlySignal = /figures\s*in\s*inr\s*per\s*month|all\s*figures.*per\s*month|amount.*per\s*month|monthly\s*components|monthly\s*gross|monthly\s*ctc|payslip|pay\s*slip|salary\s*slip/i.test(fullText);
  // "\bctc\b" catches Citi-style letters that label amounts as "CTC (INR)" without
  // ever saying "per annum". CTC is always an annual figure in Indian HR documents.
  // "amt in inr" catches annexure headers like "PARTICULARS - AMT in INR (Per Annum)".
  const hasAnnualSignal  = /per\s*annum|p\.a\.|\/yr|annual\s*ctc|annual\s*salary|annual\s*base\s*salary|annual\s*base\s*pay|cost\s*to\s*company|total\s*annual\s*comp|\bctc\b|amt\s*in\s*inr/i.test(fullText);
  const likelyAnnual = hasAnnualSignal && !hasMonthlySignal;

  // Scan full text for an explicitly stated monthly gross figure
  // TCS offer letters say: "Monthly Gross Salary...will be `51,844/-"
  // We prefer this over annual÷12 because annual CTC includes non-recurring items
  // (retention bonus, health insurance) that inflate the monthly estimate.
  const explicitMonthlyGrossMatch = fullText.match(
    /monthly\s+gross(?:\s+salary)?[^₹`\d\n]{0,80}(?:₹|`|rs\.?|inr)\s*([\d,]+)/i
  );
  const explicitMonthlyGross = explicitMonthlyGrossMatch
    ? (() => { const n = parseFloat(explicitMonthlyGrossMatch[1].replace(/,/g, "")); return n > 1000 ? n : null; })()
    : null;

  // Number lookup for one line: prefer two-column monthly detection, fall back to
  // trailing number, then currency-anchored (handles body-text sentences like
  // "is ` 15,000/- per month"), then bare Indian lakh format (OCR-garbled ₹).
  // `monthly: true` means the value is already a per-month amount and must not be ÷12:
  // always for a two-column hit; for the last two strategies only when the line itself
  // says "per month" / "/month" / "/mo".
  const extractFromLine = (line) => {
    const tc = extractTwoColMonthly(line);
    if (tc != null) return { val: tc, monthly: true };
    const tr = extractNumber(line);
    if (tr != null) return { val: tr, monthly: false };
    const cu = extractCurrencyNumber(line);
    if (cu != null) {
      const isMonthlyLine = /per\s*month|\/month|\/mo\b/i.test(line);
      return { val: cu, monthly: isMonthlyLine };
    }
    const ind = extractIndianFormatNumber(line);
    if (ind != null) {
      const isMonthlyLine = /per\s*month|\/month|\/mo\b/i.test(line);
      return { val: ind, monthly: isMonthlyLine };
    }
    return { val: null, monthly: false };
  };

  // Deduplicated lines (splitting can create duplicate lines — skip re-processing)
  const seen = new Set();
  // Track which line indices have already been claimed by a field match.
  // Prevents a broad pattern (e.g. "performance pay") from double-matching a line
  // that a more specific pattern already consumed (e.g. "monthly performance pay").
  const claimedLines = new Set();
  // For accumulating fields: field → Set of label patterns that already contributed.
  // The PDF text extractor emits a table row as the full line AND as its split chunks
  // ("Special Allowance 1,700", "Special Allowance", "1,700"). Without this guard the
  // label-only chunk matches again, picks the same number up from the following line
  // and the amount is added twice. Each distinct label therefore counts once; different
  // labels ("special allowance" + "personal allowance") are still summed.
  const accumulatedLabels = new Map();

  for (let i = 0; i < lines.length; i++) {
    const raw = lines[i];
    if (!raw || raw === "---page---") continue;
    if (seen.has(raw)) continue;
    seen.add(raw);

    // Normalised form used for label matching: separators → spaces, single-spaced, trimmed.
    const lower = raw.toLowerCase().replace(/[:\-–|]/g, " ").replace(/\s+/g, " ").trim();

    // ── Component fields ──
    for (const def of FIELD_PATTERNS) {
      // Skip if already found — UNLESS this field accumulates (e.g. special allowance,
      // where "Special Allowance 1,700" and "Personal Allowance 6,680" both map to the
      // same field and should be summed).
      if (found[def.field] && !def.accumulate) continue;
      // Skip if this line was already claimed by an earlier (more specific) field
      if (claimedLines.has(i)) continue;
      const matched = def.patterns.find(p => {
        // Word-boundary-aware match: pattern surrounded by non-alphanumeric or start/end
        const idx = lower.indexOf(p);
        if (idx === -1) return false;
        const before = idx === 0 || !/[a-z0-9]/.test(lower[idx - 1]);
        const after  = (idx + p.length) >= lower.length || !/[a-z0-9]/.test(lower[idx + p.length]);
        return before && after;
      });
      if (!matched) continue;
      // Same label already counted for this accumulating field → duplicate chunk/row.
      if (def.accumulate && accumulatedLabels.get(def.field)?.has(matched)) continue;

      // Number on the label line itself, else on one of the next 4 lines.
      let { val, monthly: gotMonthlyDirect } = extractFromLine(raw);
      let valueLineIdx = i; // track which line the number came from
      for (let k = 1; k <= 4 && val == null; k++) {
        if (i + k < lines.length) {
          ({ val, monthly: gotMonthlyDirect } = extractFromLine(lines[i + k]));
          if (val != null) valueLineIdx = i + k;
        }
      }
      if (val == null) continue;

      if (def.isAnnual) {
        // Annual field: value should be stored as annual amount
        // If two-col gave us monthly, or doc is a payslip (not annual), scale up ×12
        if (gotMonthlyDirect || !likelyAnnual) val = Math.round(val * 12);
        // Sanity: annual values 1,000–5,00,00,000 (5 crore)
        if (val < 1000 || val > 50000000) continue;
      } else {
        // Monthly field: convert annual → monthly if needed
        if (!gotMonthlyDirect && likelyAnnual && def.monthly) val = Math.round(val / 12);
        // Sanity: monthly values — basic up to 5L, others up to 2L, min 50
        const max = def.field === "basic" ? 500000 : 200000;
        if (val < 50 || val > max) continue;
      }

      const confidence = matched === def.patterns[0] ? "high" : "medium";
      if (def.accumulate && found[def.field]) {
        // Add to the existing value — keeps the first rawLine for display
        found[def.field] = {
          value: found[def.field].value + val,
          confidence: "medium",
          rawLine: found[def.field].rawLine,
        };
      } else {
        found[def.field] = { value: val, confidence, rawLine: raw.trim() };
      }
      if (def.accumulate) {
        if (!accumulatedLabels.has(def.field)) accumulatedLabels.set(def.field, new Set());
        accumulatedLabels.get(def.field).add(matched);
      }
      // Mark both the label line and the value line as consumed.
      // This prevents the value line from matching again independently
      // (e.g. "Personal Allowance: ..." on its own line matching after the header did).
      claimedLines.add(i);
      if (valueLineIdx !== i) claimedLines.add(valueLineIdx);
    }

    // ── Meta fields (gross / net) — also try mid-sentence extraction ──
    for (const def of META_PATTERNS) {
      if (meta[def.field]) continue;
      // Plain substring match (no word-boundary check) on the normalised line.
      if (def.patterns.some(p => lower.includes(p))) {
        // Track which line the value came from so we can check its annual context
        const metaExtract = (l) => extractNumber(l) ?? extractCurrencyNumber(l) ?? extractIndianFormatNumber(l);
        let val = metaExtract(raw);
        let valLine = raw;
        for (let k = 1; k <= 2 && val == null; k++) {
          if (i + k < lines.length) {
            val = metaExtract(lines[i + k]);
            if (val != null) valLine = lines[i + k];
          }
        }
        if (val && val > 0) {
          if (def.monthly) {
            // Explicitly monthly field (e.g. monthlyGross) — never divide by 12
            meta[def.field] = val;
          } else {
            // Per-line annual check: if the value's line mentions "per annum" / "p.a." / "annual",
            // treat as annual even if the document-level likelyAnnual is false.
            // This handles TCS-style letters where "Monthly" appears as a column header
            // but the body says "7,00,022/- per annum".
            const lineIsAnnual = /per\s*annum|p\.a\.|\/yr|\bannual\b/i.test(valLine);
            meta[def.field] = (likelyAnnual || lineIsAnnual) ? Math.round(val / 12) : val;
          }
        }
      }
    }
  }

  // ── BoB (Bouquet of Benefits) post-processing ──────────────────────────────
  // TCS and similar companies group allowances under a "Bouquet of Benefits" umbrella
  // with a breakdown in a separate table (Table 2 / next page).
  //
  // Scenario A: both tables are text-readable.
  //   "Bouquet of Benefits 26,522" → assigned to `special` (BoB container total).
  //   Table 2 then gives HRA, LTA, medical, telephone — sub-components of BoB.
  //   Without post-processing: BoB total stays in `special` AND sub-components added → double-count.
  //   Fix: deduct sub-components from `special` (BoB total) → remainder = flexi/special within BoB.
  //
  // Scenario B: Table 1 is image-based, Table 2 is text.
  //   Sub-components found from Table 2 but `special` not set from BoB.
  //   No double-counting issue here — sub-components are correct.
  const BOB_SUB_FIELDS = ["hra", "lta", "medical", "telephone", "meal", "cea", "conveyance"];
  if (found.special && /bouquet/i.test(found.special.rawLine || "")) {
    const subSum = BOB_SUB_FIELDS.reduce((s, k) => s + (found[k]?.value || 0), 0);
    if (subSum > 0) {
      const remainder = found.special.value - subSum;
      if (remainder <= found.special.value * 0.15) {
        // Sub-components cover ≥85% of BoB → fully broken down; discard the container total
        delete found.special;
      } else if (remainder > 100) {
        // Partial breakdown (e.g. Flexible/Special allowance within BoB not separately parsed)
        // Keep the remainder as `special` — this is the flexi/other portion of BoB
        found.special = { value: remainder, confidence: "low", rawLine: "Remainder of Bouquet of Benefits after sub-component breakdown" };
      }
    }
  }

  // ── Gross salary fallback ────────────────────────────────────────────────
  // If no components found but we have a gross figure (e.g. TCS body text),
  // estimate the breakdown from it.
  const hasComponents = Object.keys(found).some(k => k !== "profTaxMonthly");
  // Priority: explicit monthly gross regex match > meta.monthlyGross (from monthly gross line)
  // > meta.grossPay (annual CTC ÷12, least accurate — may include non-recurring items)
  const fallbackGross = explicitMonthlyGross
    || (meta.monthlyGross && meta.monthlyGross > 0 ? meta.monthlyGross : null)
    || (meta.grossPay    && meta.grossPay    > 0 ? meta.grossPay    : null);
  if (!hasComponents && fallbackGross) {
    meta.grossPay = fallbackGross;
    meta.estimatedFromGross = true;

    // Detect BoB-style structure (TCS, some other companies):
    // "Bouquet of Benefits" bundles HRA/LTA/medical inside it — no standalone HRA at top level.
    // Use a 2-part split: Basic ~35% / Special (BoB) ~65%.
    // Otherwise use the standard 3-part: Basic 40% / HRA 20% / Special 40%.
    const hasBoBStructure = /bouquet\s+of\s+benefits|bouquet\s+of\s+benefit|\bboa\b/i.test(fullText);
    if (hasBoBStructure) {
      found["basic"]   = { value: Math.round(fallbackGross * 0.35), confidence: "low", rawLine: "Estimated from gross (BoB structure)" };
      found["special"] = { value: Math.round(fallbackGross * 0.65), confidence: "low", rawLine: "Estimated from gross (BoB structure)" };
    } else {
      found["basic"]   = { value: Math.round(fallbackGross * 0.40), confidence: "low", rawLine: "Estimated from gross salary" };
      found["hra"]     = { value: Math.round(fallbackGross * 0.20), confidence: "low", rawLine: "Estimated from gross salary" };
      found["special"] = { value: Math.round(fallbackGross * 0.40), confidence: "low", rawLine: "Estimated from gross salary" };
    }
  }

  // ── Basic salary estimation when BoB sub-components found but basic missing ──
  // Scenario: Table 2 gave us HRA/LTA/medical etc. (hasComponents=true → no gross fallback)
  // but Table 1 is image-based so basic wasn't found.
  // Estimate: basic ≈ monthlyGross − sum(found monthly components).
  if (!found.basic && fallbackGross && hasComponents) {
    const monthlyFields = FIELD_PATTERNS.filter(d => !d.isAnnual && !d.isDeduction && d.field !== "basic");
    const componentSum = monthlyFields.reduce((s, d) => s + (found[d.field]?.value || 0), 0);
    const estimatedBasic = Math.round(fallbackGross - componentSum);
    if (estimatedBasic >= 1000 && estimatedBasic <= 500000) {
      found.basic = { value: estimatedBasic, confidence: "low", rawLine: "Estimated from gross minus found components" };
      meta.estimatedFromGross = true;
      meta.grossPay = fallbackGross;
    }
  }

  // ── BoB (Special Allowance) estimation when basic is known but BoB not extracted ──
  // Scenario: body text gave us basic (15,000) + MPP (4,300) + QVA (3,100) but Table 1 is
  // image-based so BoB total (26,522) wasn't found.
  // Formula: BoB ≈ monthlyGross − basic − MPP − QVA_monthly − PF_employer − gratuity
  //          − other monthly components already found.
  // PF employer and gratuity are derived from basic: 12% + 4.81%.
  // The estimate may be off by small items (city allowance, health insurance amortised).
  if (!found.special && found.basic && fallbackGross) {
    const basicVal    = found.basic.value;
    const pfMonthly   = Math.round(basicVal * 0.12);
    const gratMonthly = Math.round(basicVal * 0.0481);
    const mppMonthly  = found.perfPay?.value || 0;
    const qvaMonthly  = Math.round((found.variable?.value || 0) / 12); // variable stored as annual
    const otherMonthly = FIELD_PATTERNS
      .filter(d => !d.isAnnual && !d.isDeduction && !["basic","special","perfPay"].includes(d.field))
      .reduce((s, d) => s + (found[d.field]?.value || 0), 0);
    const bobEstimate = Math.round(
      fallbackGross - basicVal - mppMonthly - qvaMonthly - pfMonthly - gratMonthly - otherMonthly
    );
    if (bobEstimate >= 1000 && bobEstimate <= 200000) {
      found.special = { value: bobEstimate, confidence: "low", rawLine: "Estimated: monthly gross minus basic, MPP, QVA, PF, gratuity" };
      meta.estimatedFromGross = true;
      meta.grossPay = fallbackGross;
    }
  }

  return { found, meta, likelyAnnual, debugLines: lines };
}

// ── Confidence badge ──────────────────────────────────────────────────────
function ConfBadge({ conf }) {
  if (conf === "high")   return <span className="dp-conf dp-conf--high"  title="High confidence">●</span>;
  if (conf === "medium") return <span className="dp-conf dp-conf--med"   title="Verify this">◐</span>;
  if (conf === "low")    return <span className="dp-conf dp-conf--med"   title="Estimated — verify">~</span>;
  return                        <span className="dp-conf dp-conf--none"  title="Not found">○</span>;
}

// ── Main component ────────────────────────────────────────────────────────
function DocUploader({ onApply }) {
  const [stage, setStage] = dpUseState("idle"); // idle|reading|review|applied|error
  const [readingMsg, setReadingMsg] = dpUseState("Reading your document…");
  const [dragOver, setDragOver] = dpUseState(false);
  const [extraction, setExtraction] = dpUseState(null);
  const [editVals, setEditVals] = dpUseState({});
  const [errMsg, setErrMsg] = dpUseState("");
  const [debugLines, setDebugLines] = dpUseState([]);
  const [showDebug, setShowDebug] = dpUseState(false);
  const inputRef = dpUseRef();

  const processFile = dpUseCallback(async (file) => {
    if (!file) return;
    if (file.type !== "application/pdf") {
      setErrMsg("Please upload a PDF — scanned images are supported via OCR.");
      setStage("error");
      return;
    }
    setReadingMsg("Reading your document…");
    setStage("reading");
    try {
      const lines = await extractTextFromPDF(file, (msg) => setReadingMsg(msg));
      const { found, meta, likelyAnnual, debugLines: dl } = extractSalaryFields(lines);
      setDebugLines(dl);
      console.log("[doc-parser] extracted lines:", dl);
      console.log("[doc-parser] found fields:", found);

      const foundCount = Object.keys(found).length;
      if (foundCount === 0) {
        // Surface a useful diagnosis based on what we detected
        const hasText = dl.filter(l => l && l !== "---page---").join("").replace(/\s/g, "").length > 50;
        const isScanned = !hasText;
        const isSecure = window.location.protocol === "https:" || window.location.hostname === "localhost";
        let msg = "Couldn't extract salary fields from this PDF.";
        if (isScanned && !isSecure) {
          msg = "This is a scanned PDF — OCR is needed but only works on the live site (myinhand.app), not a local file. Open the app at myinhand.app and try again.";
        } else if (isScanned) {
          msg = "Scanned PDF — OCR ran but found no salary data. This may be a cover-letter-only document (no Annexure). Check if your PDF includes the salary breakdown pages.";
        } else {
          msg = "No salary fields found. This may be a cover-letter-only document — check that your PDF includes the salary breakdown / Annexure page.";
        }
        setErrMsg(msg);
        setStage("error");
        return;
      }
      const edits = {};
      for (const [k, v] of Object.entries(found)) edits[k] = String(v.value);
      setEditVals(edits);
      setExtraction({ found, meta, likelyAnnual, fileName: file.name, foundCount });
      setStage("review");
    } catch (e) {
      console.error("[doc-parser] error:", e);
      const isOCRFail = e.message && e.message.startsWith("OCR_FAILED");
      let msg;
      if (isOCRFail) {
        msg = "OCR engine couldn't start — this scanned PDF needs internet access to load the OCR model. Check your connection and try again.";
      } else {
        msg = "Failed to read this PDF. Make sure it's not password-protected.";
      }
      setErrMsg(msg);
      setStage("error");
    }
  }, []);

  const handleDrop = dpUseCallback((e) => {
    e.preventDefault();
    setDragOver(false);
    processFile(e.dataTransfer.files[0]);
  }, [processFile]);

  const handleApply = () => {
    const monthly = {};
    const stateExtra = {};
    for (const def of FIELD_PATTERNS) {
      const raw = editVals[def.field];
      if (raw == null || raw === "") continue;
      const v = parseFloat(raw);
      if (isNaN(v) || v < 0) continue;
      if (def.field === "profTaxMonthly") {
        stateExtra.profTaxMonthly = v;
      } else if (def.isAnnual) {
        // Annual fields go to top-level state. Use stateField if set (e.g. joiningBonus → bonusAnnual).
        // Accumulate if multiple parsed fields map to the same state key.
        const key = def.stateField || def.field;
        stateExtra[key] = (stateExtra[key] || 0) + v;
      } else {
        monthly[def.field] = v;
      }
    }
    onApply({ monthly, ...stateExtra });
    setStage("applied");
  };

  const reset = () => {
    setStage("idle"); setExtraction(null); setErrMsg("");
    setEditVals({}); setDebugLines([]); setShowDebug(false);
  };

  // ── Idle / drop zone ──────────────────────────────────────────────────
  if (stage === "idle" || stage === "error") return (
    <div
      className={`dp-zone ${dragOver ? "dp-zone--over" : ""} ${stage === "error" ? "dp-zone--error" : ""}`}
      onDragOver={e => { e.preventDefault(); setDragOver(true); }}
      onDragLeave={() => setDragOver(false)}
      onDrop={handleDrop}
      onClick={() => inputRef.current?.click()}
    >
      <input ref={inputRef} type="file" accept="application/pdf" style={{ display: "none" }}
        onChange={e => processFile(e.target.files[0])} />
      <div className="dp-zone__icon">📄</div>
      <div className="dp-zone__title">
        {stage === "error" ? errMsg : "Drop your payslip or offer letter here"}
      </div>
      <div className="dp-zone__sub">
        {stage === "error" ? (
          <span className="dp-zone__retry" onClick={e => { e.stopPropagation(); reset(); }}>Try again</span>
        ) : "PDF only · processed entirely in your browser · nothing uploaded"}
      </div>
      {stage === "error" && debugLines.length > 0 && (
        <div style={{ marginTop: 8 }} onClick={e => e.stopPropagation()}>
          <button
            className="dp-zone__retry"
            style={{ fontSize: 11, opacity: 0.6 }}
            onClick={e => { e.stopPropagation(); setShowDebug(d => !d); }}
          >
            {showDebug ? "Hide" : "Show"} extracted text
          </button>
          {showDebug && (
            <pre style={{ fontSize: 10, textAlign: "left", opacity: 0.7, marginTop: 8, maxHeight: 200, overflow: "auto", whiteSpace: "pre-wrap" }}>
              {debugLines.map(l => l === "---page---" ? "\n── page break ──\n" : l).join("\n")}
            </pre>
          )}
        </div>
      )}
      {stage !== "error" && (
        <button className="dp-zone__btn" onClick={e => e.stopPropagation()}>Choose PDF</button>
      )}
    </div>
  );

  // ── Reading ───────────────────────────────────────────────────────────
  if (stage === "reading") return (
    <div className="dp-loading">
      <div className="dp-loading__spinner" />
      <div className="dp-loading__text">{readingMsg}</div>
      <div className="dp-loading__sub">
        {readingMsg.toLowerCase().includes("ocr")
          ? "Running OCR client-side — nothing leaves your browser. This takes ~30–60 s for scanned PDFs."
          : "Extracting salary fields client-side. Nothing leaves your browser."}
      </div>
    </div>
  );

  // ── Applied ───────────────────────────────────────────────────────────
  if (stage === "applied") return (
    <div className="dp-applied">
      <span className="dp-applied__tick">✓</span>
      <span>Fields applied — scroll down to verify the CTC breakdown.</span>
      <button className="dp-applied__reset" onClick={reset}>Upload another</button>
    </div>
  );

  // ── Review ────────────────────────────────────────────────────────────
  const { found, meta, likelyAnnual, fileName, foundCount } = extraction;
  // Annual fields (bonusAnnual, variable) are optional add-ons — don't clutter the "not found" list
  const notFound = FIELD_PATTERNS.filter(d => !found[d.field] && !d.isDeduction && !d.isAnnual);

  return (
    <div className="dp-review">
      <div className="dp-review__head">
        <div>
          <div className="dp-review__title">
            ✓ Found {foundCount} field{foundCount !== 1 ? "s" : ""} in <span className="dp-review__fname">{fileName}</span>
          </div>
          <div className="dp-review__hint">
            {meta.estimatedFromGross
              ? `Breakdown estimated from ${meta.monthlyGross ? "monthly gross" : "annual CTC÷12"} (₹${(meta.monthlyGross || meta.grossPay)?.toLocaleString("en-IN")}/mo) — table not readable. Please verify.`
              : likelyAnnual ? "Looks like an offer letter — annual values converted to monthly." : "Payslip detected — monthly values."}
            {!meta.estimatedFromGross && " Review and edit before applying."}
          </div>
        </div>
        <button className="dp-review__cancel" onClick={reset}>✕ Cancel</button>
      </div>

      <div className="dp-review__grid">
        {FIELD_PATTERNS.filter(d => found[d.field]).map(def => (
          <div key={def.field} className="dp-review__row">
            <ConfBadge conf={found[def.field].confidence} />
            <span className="dp-review__lbl">{def.label}</span>
            <div className="dp-review__input-wrap">
              <span className="dp-review__curr">₹</span>
              <input
                type="number"
                className="dp-review__input"
                value={editVals[def.field] ?? ""}
                min={0}
                onChange={e => setEditVals(v => ({ ...v, [def.field]: e.target.value }))}
              />
              <span className="dp-review__unit">{def.isAnnual ? "/yr" : "/mo"}</span>
            </div>
          </div>
        ))}
        {notFound.map(def => (
          <div key={def.field} className="dp-review__row dp-review__row--missing">
            <ConfBadge conf="none" />
            <span className="dp-review__lbl">{def.label}</span>
            <div className="dp-review__input-wrap">
              <span className="dp-review__curr">₹</span>
              <input
                type="number"
                className="dp-review__input"
                placeholder="enter manually"
                value={editVals[def.field] ?? ""}
                min={0}
                onChange={e => setEditVals(v => ({ ...v, [def.field]: e.target.value }))}
              />
              <span className="dp-review__unit">/mo</span>
            </div>
          </div>
        ))}
      </div>

      {(meta.grossPay || meta.netPay) && (
        <div className="dp-review__meta">
          {meta.grossPay && <span>Gross from doc: <strong>₹{meta.grossPay.toLocaleString("en-IN")}</strong>/mo</span>}
          {meta.netPay   && <span>Net from doc: <strong>₹{meta.netPay.toLocaleString("en-IN")}</strong>/mo</span>}
          <span className="dp-review__meta-note">Shown for reference only.</span>
        </div>
      )}

      <div className="dp-review__actions">
        <button className="dp-review__apply" onClick={handleApply}>
          Apply to calculator →
        </button>
        <button className="dp-review__skip" onClick={reset}>Enter manually instead</button>
      </div>

      {/* Debug panel — helps diagnose parsing issues */}
      {debugLines.length > 0 && (
        <div style={{ marginTop: 12 }}>
          <button
            className="dp-zone__retry"
            style={{ fontSize: 11, opacity: 0.5 }}
            onClick={() => setShowDebug(d => !d)}
          >
            {showDebug ? "Hide" : "Show"} extracted text ({debugLines.filter(l => l && l !== "---page---").length} lines across {debugLines.filter(l => l === "---page---").length} pages)
          </button>
          {showDebug && (
            <pre style={{ fontSize: 10, textAlign: "left", opacity: 0.65, marginTop: 8, maxHeight: 360, overflow: "auto", whiteSpace: "pre-wrap", background: "rgba(255,255,255,0.04)", padding: 10, borderRadius: 8 }}>
              {debugLines.map((l, i) =>
                l === "---page---"
                  ? `\n── page break ──\n`
                  : l
              ).join("\n")}
            </pre>
          )}
        </div>
      )}
    </div>
  );
}

// Publish the component as a property of the global `window` object.
window.DocUploader = DocUploader;