# -*- coding: utf-8 -*-
from unstructured_inference.inference.layout import DocumentLayout
import dataiku
import tempfile
import os
import shutil
import pandas as pd

# Read input folder containing image files
folder = dataiku.Folder("FAUwCSvY")

# Column order of the output dataset. Also used to build an empty but correctly
# shaped DataFrame when no layout element is detected in any file.
OUTPUT_COLUMNS = ["x1", "y1", "x2", "y2", "prob", "type", "file_path"]

# Per-file result frames. They are concatenated once after the loop: growing a
# DataFrame with pd.concat on every iteration re-copies all accumulated rows
# each time, and seeding it with an empty object-dtype frame makes the result
# dtypes depend on how pandas handles empty frames during concatenation.
frames = []

# Loop through each file in the folder and apply layout analysis
for file_path in folder.list_paths_in_partition():
    # Create a temporary directory to store the file locally during processing;
    # it is removed (with its content) when the `with` block exits.
    with tempfile.TemporaryDirectory() as tmpdirname:
        # Build the local path inside the temporary directory. The leading "/"
        # is stripped because os.path.join would otherwise discard
        # `tmpdirname`; this also keeps the file inside the temporary directory
        # if a path ever comes without a leading separator.
        local_file_path = os.path.join(tmpdirname, file_path.lstrip("/"))
        os.makedirs(os.path.dirname(local_file_path), exist_ok=True)

        # Copy file from the remote Dataiku folder to the local temporary directory
        with folder.get_download_stream(file_path) as f_remote:
            with open(local_file_path, "wb") as f_local:
                shutil.copyfileobj(f_remote, f_local)

        # Load the image and analyze its layout using `DocumentLayout`
        layout = DocumentLayout.from_image_file(local_file_path)

    # One row per element detected on the first page of the layout: bounding
    # box corners, detection probability, element type and source file path.
    rows = [
        {
            "x1": element.bbox.x1,
            "y1": element.bbox.y1,
            "x2": element.bbox.x2,
            "y2": element.bbox.y2,
            "prob": element.prob,
            "type": element.type,
            "file_path": file_path,  # Keep track of the source file
        }
        for element in layout.pages[0].elements
    ]

    # Files without any detected element contribute no rows; skipping them
    # avoids feeding empty frames to pd.concat.
    if rows:
        frames.append(pd.DataFrame(rows, columns=OUTPUT_COLUMNS))

# Assemble the final result in a single concatenation. pd.concat raises on an
# empty list, so fall back to an empty frame with the expected columns.
if frames:
    output_df = pd.concat(frames, ignore_index=True)
else:
    output_df = pd.DataFrame(columns=OUTPUT_COLUMNS)

# Write the results to a Dataiku dataset
dataiku.Dataset("output_region_unstructured").write_with_schema(output_df)
