import mammoth from "mammoth";
import { createWorker, PSM, OEM } from "tesseract.js";

import { getDocument, GlobalWorkerOptions } from "pdfjs-dist";

// Tell pdf.js which URL to load its worker script from. This runs once, as a
// module-level side effect, when this file is first imported.
// NOTE(review): assumes the app serves pdf.worker.mjs at the site root — confirm
// against the build/static-asset setup.
GlobalWorkerOptions.workerSrc = "/pdf.worker.mjs";

/**
 * Extracts the text of every page of a PDF file.
 *
 * @param {{ arrayBuffer: () => Promise<ArrayBuffer> }} file - A File/Blob-like
 *   object whose `arrayBuffer()` resolves to the raw PDF bytes.
 * @returns {Promise<{ numPages: number, pages: Array<{ pageNum: number, text: string }> }>}
 *   The page count and one entry per page, in page order.
 * @throws {Error} "Failed to process PDF: …" when reading, parsing or text
 *   extraction fails. The original error is available as `error.cause`.
 */
async function extractTextFromPDF(file) {
  let loadingTask = null;
  try {
    const arrayBuffer = await file.arrayBuffer();
    loadingTask = getDocument({ data: arrayBuffer });
    const pdf = await loadingTask.promise;

    // Kick off extraction for all pages at once (page numbers are 1-based);
    // Promise.all keeps the results in page order.
    const pageJobs = Array.from({ length: pdf.numPages }, (_, index) =>
      extractPageText(pdf, index + 1),
    );

    return {
      numPages: pdf.numPages,
      pages: await Promise.all(pageJobs),
    };
  } catch (error) {
    throw new Error(`Failed to process PDF: ${error.message}`, {
      cause: error,
    });
  } finally {
    // The returned data is plain text, so the parsed document is no longer
    // needed; destroying the loading task releases the document and its worker
    // resources on both the success and the failure path.
    if (loadingTask) {
      await loadingTask.destroy();
    }
  }
}

/**
 * Extracts the text of one PDF page in approximate reading order.
 *
 * Text fragments are grouped into visual lines by their vertical position,
 * lines are emitted top to bottom, and fragments within a line left to right.
 *
 * @param {object} pdf - Loaded pdf.js document; only `getPage(pageNum)` is used.
 * @param {number} pageNum - Page number passed to `pdf.getPage`.
 * @returns {Promise<{ pageNum: number, text: string }>} The page number and its
 *   trimmed text, with "\n" between visual lines.
 */
async function extractPageText(pdf, pageNum) {
  // Fragments whose (rounded) y positions are within this distance of the
  // first fragment of a line are treated as part of that line.
  const LINE_TOLERANCE = 10;

  const page = await pdf.getPage(pageNum);
  const textContent = await page.getTextContent();

  // transform[4] / transform[5] are read as the fragment's x / y position.
  const fragments = textContent.items.map((item) => ({
    text: item.str,
    x: Math.round(item.transform[4]),
    y: Math.round(item.transform[5]),
  }));

  // Strictly order by y (descending, i.e. larger y first) and then x. A
  // comparator that mixes a tolerance into the ordering is not transitive and
  // makes the sort result depend on input order, so the tolerance is applied
  // in a separate grouping pass below instead.
  fragments.sort((a, b) => b.y - a.y || a.x - b.x);

  // Group consecutive fragments into lines, anchored on the first (highest)
  // fragment of each line so slowly drifting baselines cannot chain together.
  const lines = [];
  for (const fragment of fragments) {
    const currentLine = lines[lines.length - 1];
    if (currentLine && Math.abs(currentLine.y - fragment.y) <= LINE_TOLERANCE) {
      currentLine.fragments.push(fragment);
    } else {
      lines.push({ y: fragment.y, fragments: [fragment] });
    }
  }

  const pageText = lines
    .map((line) => {
      // Within a line only the horizontal position matters.
      line.fragments.sort((a, b) => a.x - b.x);

      return line.fragments.reduce((acc, fragment, index, all) => {
        const next = all[index + 1];
        // Insert a separating space unless either side already provides one.
        const needsSpace =
          next && !fragment.text.endsWith(" ") && !next.text.startsWith(" ");
        return acc + fragment.text + (needsSpace ? " " : "");
      }, "");
    })
    .join("\n");

  return {
    pageNum,
    text: pageText.trim(),
  };
}

/**
 * Reads a file with `FileReader.readAsDataURL` and resolves with the result.
 *
 * Note that the resolved value is whatever `readAsDataURL` produces (a data
 * URL string), not a bare base64 payload.
 *
 * @param {Blob} file - The File/Blob to read.
 * @returns {Promise<string>} Resolves with `reader.result` once loading finishes.
 * @throws Rejects with the reader's error (`reader.error`) when the read fails,
 *   so callers receive an error object with a `message` rather than a
 *   progress event.
 */
async function convertToBase64(file) {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    // Attach handlers before starting the read so no event can be missed.
    reader.onload = () => resolve(reader.result);
    reader.onerror = () =>
      reject(reader.error ?? new Error("Failed to read file"));
    reader.readAsDataURL(file);
  });
}

/**
 * Runs OCR on an image file and returns the recognized text plus structure.
 *
 * @param {Blob} file - Image File/Blob; it is exposed to the OCR worker through
 *   a temporary object URL.
 * @returns {Promise<{
 *   text: string,
 *   words: Array<{ text: string, confidence: number, bounds: object }>,
 *   paragraphs: Array<{ text: string, confidence: number, bounds: object }>,
 *   confidence: number,
 *   html: string
 * }>} Recognition result. `words` / `paragraphs` are empty arrays when the OCR
 *   output does not contain them.
 * @throws {Error} "Failed to process image file: …" on any failure. The
 *   original error is available as `error.cause`.
 */
async function extractTextFromImage(file) {
  let worker = null;
  let imageUrl = null;
  try {
    worker = await createWorker();

    // Tesseract is given a URL rather than the File object itself.
    imageUrl = URL.createObjectURL(file);

    const { data } = await worker.recognize(imageUrl);

    // Keep only the fields callers need and rename `bbox` to `bounds`.
    const summarize = (entry) => ({
      text: entry.text,
      confidence: entry.confidence,
      bounds: entry.bbox,
    });

    return {
      text: data.text,
      words: data.words?.map((word) => summarize(word)) || [],
      paragraphs: data.paragraphs?.map((para) => summarize(para)) || [],
      confidence: data.confidence,
      html: convertToStructuredHtml(data),
    };
  } catch (error) {
    throw new Error(`Failed to process image file: ${error.message}`, {
      cause: error,
    });
  } finally {
    // Release the temporary object URL, then shut the worker down; both run
    // whether recognition succeeded or failed.
    if (imageUrl !== null) {
      URL.revokeObjectURL(imageUrl);
    }
    if (worker) {
      await worker.terminate();
    }
  }
}

// Characters that must be replaced before text is placed into HTML content or
// a double-quoted attribute value.
const OCR_HTML_ESCAPES = {
  "&": "&amp;",
  "<": "&lt;",
  ">": "&gt;",
  '"': "&quot;",
  "'": "&#39;",
};

// Escapes a value for safe inclusion in HTML text or attribute values.
function escapeHtml(value) {
  return String(value).replace(/[&<>"']/g, (ch) => OCR_HTML_ESCAPES[ch]);
}

/**
 * Converts OCR data to structured HTML.
 *
 * Each paragraph becomes a `<p class="ocr-paragraph">` carrying its confidence
 * and a relative offset taken from its bounding box; when there are no
 * paragraphs the raw text is wrapped in a single paragraph instead.
 *
 * Recognized text comes from arbitrary images, so every interpolated value is
 * HTML-escaped to keep OCR output from injecting markup.
 *
 * @param {{ text?: string, paragraphs?: Array<{ text: string, confidence: number, bbox?: { x0: number, y0: number } }> }} ocrData
 *   OCR result to render.
 * @returns {string} HTML wrapped in `<div class="ocr-text">…</div>`.
 */
function convertToStructuredHtml(ocrData) {
  const paragraphs = ocrData.paragraphs ?? [];

  let body;
  if (paragraphs.length > 0) {
    body = paragraphs
      .map((paragraph) => {
        // A paragraph without a bounding box is rendered at offset 0 rather
        // than aborting the whole conversion.
        const top = paragraph.bbox?.y0 ?? 0;
        const left = paragraph.bbox?.x0 ?? 0;

        // The whitespace layout inside the tag mirrors the multi-line template
        // this markup was originally written with.
        return [
          '<p class="ocr-paragraph" ',
          `        data-confidence="${escapeHtml(paragraph.confidence)}"`,
          '        style="position: relative; ',
          `               top: ${escapeHtml(top)}px; `,
          `               left: ${escapeHtml(left)}px">${escapeHtml(paragraph.text)}</p>`,
        ].join("\n");
      })
      .join("");
  } else {
    // Fallback to just using the raw text if no paragraph structure
    body = `<p class="ocr-paragraph">${escapeHtml(ocrData.text ?? "")}</p>`;
  }

  return `<div class="ocr-text">${body}</div>`;
}

/**
 * Public entry points of this module, grouped under a single named export:
 * - `extractTextFromPDF(file)`   — per-page text of a PDF file
 * - `extractTextFromImage(file)` — OCR text and structure of an image file
 * - `convertToBase64(file)`      — file contents via `FileReader.readAsDataURL`
 */
export const FileUtils = {
  extractTextFromPDF,
  extractTextFromImage,
  convertToBase64,
};
