import pdfplumber
import re
from datetime import datetime
import os
from PIL import Image

# Marker that starts every application record in the extracted text.
_ENTRY_SPLIT_RE = re.compile(r"210 Nomor Permohonan")

# Application number, e.g. ": DID2023000001".
_APPLICATION_NUMBER_RE = re.compile(r":\s*(DID\d+)")

# Filing date/time in "dd/mm/YYYY HH:MM:SS" form.
_RECEIVED_AT_RE = re.compile(
    r"220 Tanggal Penerimaan\s*:\s*(\d{2}/\d{2}/\d{4} \d{2}:\d{2}:\d{2})"
)
_RECEIVED_AT_FORMAT = "%d/%m/%Y %H:%M:%S"

# Single-line text fields as (result key, pattern) pairs, in output order.
# After the colon only same-line whitespace is skipped ([^\S\n]*), so an empty
# field yields an empty capture instead of swallowing the following line.
_TEXT_FIELD_PATTERNS = (
    ("tipe_merek", re.compile(r"Tipe Merek\s*:[^\S\n]*(.*)")),
    (
        "nama_referensi_label_merek",
        re.compile(r"Nama Referensi Label Merek\s*:[^\S\n]*(.*)"),
    ),
    ("arti_bahasa", re.compile(r"566 Arti Bahasa\s*:[^\S\n]*(.*)")),
    ("uraian_warna", re.compile(r"591 Uraian Warna\s*:[^\S\n]*(.*)")),
)


def parse_entries(text):
    """Parse extracted PDF text into a list of application records.

    The text is split on the "210 Nomor Permohonan" marker and each resulting
    block is scanned for the known fields. A block is kept only when an
    application number (``DID`` followed by digits) is found in it.

    Args:
        text: Plain text, typically the concatenated text of all PDF pages.

    Returns:
        A list of dicts with the keys ``nomor_permohonan``,
        ``tanggal_penerimaan`` (a ``datetime`` or ``None``), ``tipe_merek``,
        ``nama_referensi_label_merek``, ``arti_bahasa`` and ``uraian_warna``.
        Fields that are missing, empty, or unparsable are ``None``.
    """
    entries = []

    for block in _ENTRY_SPLIT_RE.split(text):
        number_match = _APPLICATION_NUMBER_RE.search(block)
        if number_match is None:
            # Only keep a record when its main field is present; this also
            # drops any text that precedes the first marker.
            continue

        data = {
            "nomor_permohonan": number_match.group(1),
            "tanggal_penerimaan": None,
        }

        date_match = _RECEIVED_AT_RE.search(block)
        if date_match:
            try:
                data["tanggal_penerimaan"] = datetime.strptime(
                    date_match.group(1), _RECEIVED_AT_FORMAT
                )
            except ValueError:
                # Digits matched the shape but not a real date/time
                # (e.g. 31/02/2023); leave the field as None.
                pass

        for key, pattern in _TEXT_FIELD_PATTERNS:
            field_match = pattern.search(block)
            # An empty or whitespace-only value is reported as None.
            data[key] = (field_match.group(1).strip() or None) if field_match else None

        entries.append(data)

    return entries


def extract_data_from_pdf(path, output_dir="./uploads/images"):
    """Extract application records and logo images from a PDF.

    Every page's text is collected and handed to ``parse_entries``. Every
    image object reported by pdfplumber on a page is cropped out of a
    rendering of that page and saved as a PNG in ``output_dir``.

    Args:
        path: PDF location, passed straight to ``pdfplumber.open``.
        output_dir: Directory for the cropped PNG files; created if missing.

    Returns:
        The list produced by ``parse_entries``, where each dict additionally
        has a ``logo_path`` key holding the path of a saved image or ``None``.
    """
    page_texts = []
    image_paths = []
    os.makedirs(output_dir, exist_ok=True)

    with pdfplumber.open(path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            # Collect the page text; pages without extractable text are skipped.
            page_text = page.extract_text()
            if page_text:
                page_texts.append(page_text + "\n")

            # Extract images/logos. The page is rendered once (and only when
            # it actually has images) instead of once per image.
            page_images = page.images
            if not page_images:
                continue
            # NOTE(review): the bbox below is in page coordinates; this relies
            # on to_image()'s default resolution mapping them 1:1 to pixels.
            rendered = page.to_image().original
            for image_no, image in enumerate(page_images, start=1):
                bbox = (image["x0"], image["top"], image["x1"], image["bottom"])
                # NOTE: file names depend only on page/image index, so another
                # call using the same output_dir overwrites these files.
                img_path = os.path.join(
                    output_dir, f"logo_page{page_no}_{image_no}.png"
                )
                rendered.crop(bbox).save(img_path, format="PNG")
                image_paths.append(img_path)

    entries = parse_entries("".join(page_texts))

    # Associate logos with records purely by position (temporary heuristic):
    # the n-th record gets the n-th saved image, or None when images run out.
    for idx, entry in enumerate(entries):
        entry["logo_path"] = image_paths[idx] if idx < len(image_paths) else None

    return entries
