import os
import json
import pdfplumber
from difflib import get_close_matches

# Locations of the sample PDFs, the keyword definitions (input) and the
# coordinate map (output).
SAMPLES_FOLDER = "samples"
KEYWORDS_FILE = "template_keywords.json"
OUTPUT_FILE = "template_coords.json"

# Read the keyword definitions once, up front.
with open(KEYWORDS_FILE, encoding="utf-8") as keywords_file:
    keyword_data = json.loads(keywords_file.read())

# Filled in per processed sample PDF and written to OUTPUT_FILE at the end.
output = dict()

def _find_line_anchor(words, line_text):
    """Return the word dict at which *line_text* starts, or None if not found.

    *words* is a list of word dicts (each with a ``"text"`` key) and
    *line_text* is one line of page text. The line is split on whitespace and
    located as a run of consecutive words with exactly those texts; the first
    word of the first such run is returned.
    """
    tokens = line_text.split()
    if not tokens:
        return None

    texts = [word["text"] for word in words]
    span = len(tokens)
    for start in range(len(texts) - span + 1):
        if texts[start:start + span] == tokens:
            return words[start]

    # Fallback: the line's whitespace tokens did not line up with the word
    # list, so accept a single word that contains the whole line.
    for word in words:
        if line_text in word["text"]:
            return word
    return None


def find_keyword_coords(pdf, keywords):
    """Locate each keyword in *pdf* and return the position where it starts.

    Each keyword is fuzzy-matched (difflib, cutoff 0.8) against the text lines
    of every page. For the best-matching line, the first word of that line is
    looked up in the page's word list and its position is recorded.

    Args:
        pdf: Object exposing ``pages``; every page must provide
            ``extract_text()`` and ``extract_words()`` (as pdfplumber pages
            do), the latter yielding dicts with ``"text"``, ``"x0"`` and
            ``"top"`` keys.
        keywords: Iterable of keyword strings to search for.

    Returns:
        Dict mapping each located keyword to
        ``{"x": float, "y": float, "page": int}``, where ``x``/``y`` are the
        anchor word's ``x0``/``top`` values and ``page`` is the zero-based
        page index. Keywords that cannot be located are omitted. If a keyword
        is found on several pages, the last page wins.
    """
    coords = {}
    for page_num, page in enumerate(pdf.pages):
        # extract_text() may yield None instead of "" for a page without text
        # (seen in some pdfplumber versions); treat that as an empty page.
        text_lines = (page.extract_text() or "").split("\n")
        words = None  # extracted lazily, at most once per page

        for keyword in keywords:
            # Look for the page line most similar to the keyword.
            match = get_close_matches(keyword, text_lines, n=1, cutoff=0.8)
            if not match:
                continue
            if words is None:
                words = page.extract_words()

            # The matched text is a whole line, whereas the word list holds
            # single words, so anchor on the word that starts the line.
            anchor = _find_line_anchor(words, match[0])
            if anchor is None:
                continue
            coords[keyword] = {
                "x": float(anchor["x0"]),
                "y": float(anchor["top"]),
                "page": page_num,
            }
    return coords

# Sample PDFs are recognised by this filename suffix.
SAMPLE_SUFFIX = "_SAMPLE.pdf"

# Process every sample PDF. The listing is sorted because os.listdir returns
# entries in arbitrary order, which would make the output key order vary
# between runs and machines.
for filename in sorted(os.listdir(SAMPLES_FOLDER)):
    if not filename.endswith(SAMPLE_SUFFIX):
        continue

    sample_path = os.path.join(SAMPLES_FOLDER, filename)
    print(f"🔍 Memproses: {filename}")

    # Strip only the trailing suffix/extension; str.replace would also remove
    # occurrences that appear in the middle of the filename.
    template_key = filename[:-len(SAMPLE_SUFFIX)]
    keywords_key = filename[:-len(".pdf")]

    try:
        with pdfplumber.open(sample_path) as pdf:
            # Keywords are stored under the sample's filename without ".pdf";
            # a sample with no entry is processed with an empty keyword list.
            keywords = keyword_data.get(keywords_key, {}).get("keywords", [])
            coords = find_keyword_coords(pdf, keywords)
            output[f"{template_key}_BLANK"] = {"coordinates": coords}
    except Exception as e:
        # Best-effort batch run: report the failing file and carry on.
        print(f"❌ Gagal memproses {filename}: {e}")

# Save the collected coordinates.
with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
    json.dump(output, f, indent=2)

# Report the configured output path rather than a hard-coded copy of it.
print(f"✅ Koordinat berhasil disimpan ke {OUTPUT_FILE}")
