import logging
import os
import shutil

from dataiku.code_env_resources import set_env_var

# Set-up logging: configure the root logger with the default handler and use a
# dedicated, INFO-level logger for every message emitted by this script.
logging.basicConfig()
logger = logging.getLogger("code_env_resources.document_extraction")
logger.setLevel(logging.INFO)

# For offline execution of docling, the user may want to download the models in advance and place them in a specific folder.
# The DOCUMENT_EXTRACTION_MODELS env var holds the location of that folder.
# NOTE(review): the third argument presumably makes the value relative to the code env resources directory - confirm against the dataiku API.
set_env_var("DOCUMENT_EXTRACTION_MODELS", "document_extraction_models", True)

# Read the value back from the process environment.
# NOTE(review): this assumes set_env_var also exports the variable into os.environ of the current process;
# if it does not, dir_path is None and os.makedirs below raises a TypeError - confirm.
dir_path = os.environ.get("DOCUMENT_EXTRACTION_MODELS")

# Make sure the models folder exists; exist_ok=True keeps this a no-op when the script is run again.
os.makedirs(dir_path, exist_ok=True)

# For tesseract, the tesseract binary must be installed on the system, and the TESSDATA_PREFIX env var should point to the tessdata folder containing the .traineddata files.
# If the TESSDATA_PREFIX variable is already set, we do nothing because the user probably knows what they are doing.
# If not, we try to find a reasonable default value depending on the OS distribution. We then copy the tessdata files to the codeenv resources if we find them.
# Copying the tessdata files to the codeenv resources is useful because it allows users to add additional languages directly from the UI of the codeenv.

def check_tessdata_files(path: str) -> bool:
    """Check if the given path contains any .traineddata files.
    Having .traineddata files is necessary for tesseract to work properly.

    :param path: path of the folder to inspect
    :return: True if at least one entry of the folder ends with '.traineddata', False otherwise.
             False is also returned when the folder cannot be listed (missing path, path that is
             not a directory, insufficient permissions, ...).
    """
    try:
        tessdata_files = os.listdir(path)
    except OSError:
        # OSError covers FileNotFoundError, NotADirectoryError and PermissionError: in every case
        # the folder is unusable as a tessdata folder, so report "no files" instead of crashing.
        return False
    return any(fname.endswith('.traineddata') for fname in tessdata_files)

def find_possible_path_for_tessdata():
    """
    Try to find the tessdata folder path depending on the OS distribution.

    Only Linux is supported. The distribution family is read from /etc/os-release: the ID_LIKE
    field is used when present, otherwise the ID field (some distributions, e.g. debian, do not
    declare ID_LIKE).
    - rhel / fedora families: a fixed, well-known path is returned.
    - debian family: the files of the 'tesseract-ocr-eng' package are listed with dpkg.
    - suse family: the files of the 'tesseract-ocr' package are listed with rpm.
    For the package based lookups, exactly one existing directory ending with '/tessdata' must be
    found, otherwise None is returned.

    This function is best-effort: any failure is logged and results in None, never in an exception.

    :return: None if no possible path found, else a possible path to the tessdata folder
             (always ending with a slash).
    """
    import platform
    import subprocess

    if platform.system() != "Linux":
        logger.info("OS not supported for tesseract, to use tesseract please set TESSDATA_PREFIX env var manually")
        return None

    # Read the distribution identifiers. Keys are matched exactly (so VERSION_ID is not mistaken
    # for ID) and the optional quotes around the values are removed.
    os_id = ""
    id_like = ""
    try:
        with open("/etc/os-release", "r", encoding="utf-8") as os_release_file:
            for line in os_release_file:
                key, separator, value = line.strip().partition("=")
                if not separator:
                    continue
                value = value.strip("\"'").lower()
                if key == "ID_LIKE":
                    id_like = value
                elif key == "ID":
                    os_id = value
    except Exception as e:
        # Deliberately broad: detection must never break the code env resources initialization.
        logger.error("Could not find the tesseract tessdata folder. please set TESSDATA_PREFIX env var manually: %s", e)
        return None

    # ID_LIKE may list several families (e.g. "rhel fedora", "ubuntu debian"), hence the
    # substring checks below.
    family = id_like or os_id

    if "rhel" in family or "fedora" in family:
        return "/usr/share/tesseract/tessdata/"

    if "debian" in family:
        command = ["dpkg", "-L", "tesseract-ocr-eng"]
    elif "suse" in family:
        command = ["rpm", "-ql", "tesseract-ocr"]
    else:
        logger.info("OS distribution not recognized, for tesseract usage please set TESSDATA_PREFIX env var manually")
        return None

    try:
        # No shell is involved: the command is an argument list and the output is filtered in Python.
        result = subprocess.run(
            command,
            check=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
    except Exception as e:
        # Covers a missing package manager binary as well as a package that is not installed.
        logger.error("Could not find the tessdata folder, please set TESSDATA_PREFIX env var manually: %s", e)
        return None

    listed_paths = (line.strip() for line in result.stdout.splitlines())
    tessdata_dirs = [path for path in listed_paths
                     if path.endswith("/tessdata") and os.path.isdir(path)]

    if len(tessdata_dirs) == 1:
        # should end with a slash
        return tessdata_dirs[0] + "/"
    if not tessdata_dirs:
        logger.info("No tessdata directory found in the installed tesseract package, please set TESSDATA_PREFIX env var manually")
        return None
    logger.warning("Found more than one tessdata directory, please set TESSDATA_PREFIX env var manually")
    return None


def copy_tessdata_files(src: str, dest: str):
    """
    Recursively copy the tessdata folder located at src into dest.

    An already existing dest folder is accepted (dirs_exist_ok=True). The outcome is only
    reported through the logger: a failure is logged as an error and is not re-raised.
    :param src: path of the folder holding the tessdata files
    :param dest: path of the folder receiving the copy
    :return: None
    """
    try:
        shutil.copytree(src, dest, dirs_exist_ok=True)
        success_message = f"Copied tessdata files from {src} to {dest}"
        logger.info(success_message)
    except Exception as copy_error:
        failure_message = f"Could not copy the tessdata folder from {src} to {dest}. Please set TESSDATA_PREFIX env var manually. {copy_error}"
        logger.error(failure_message)

# Locate the tesseract binary on the PATH. The whole tessdata set-up below is skipped when it is not found.
try:
    tesseract_path = shutil.which("tesseract")
except Exception as e:
    tesseract_path = None
    logger.info("Could not find tesseract binary: " + str(e))

if tesseract_path is not None:
    logger.info("Tesseract binary found at: " + tesseract_path)
    # Read the variable once so that every check and message below refers to the same value.
    tessdata_prefix = os.environ.get("TESSDATA_PREFIX")
    if tessdata_prefix is not None:
        # The user already configured tesseract: only validate the folder, never copy anything.
        if check_tessdata_files(tessdata_prefix):
            logger.info("TESSDATA_PREFIX is set to " + tessdata_prefix
                        + " and contains .traineddata files. Will not copy the tessdata folder to the codeenv resources.")
        else:
            logger.warning("TESSDATA_PREFIX is set to " + tessdata_prefix
                           + " but does not contain any .traineddata files. Please fix the TESSDATA_PREFIX env var.")
    else:
        logger.info("TESSDATA_PREFIX is not set. Will try to find the tessdata folder.")
        possible_tessdata_path = find_possible_path_for_tessdata()
        if possible_tessdata_path is not None:
            if check_tessdata_files(possible_tessdata_path):
                # Copy the system tessdata folder into a 'tessdata' sub-folder of the models folder.
                code_env_resources_path = os.path.join(dir_path, "tessdata/")
                # Single f-string so that both paths are interpolated in the message.
                logger.info(f"Found .traineddata files in {possible_tessdata_path}. Will copy the tessdata folder to the codeenv resources at {code_env_resources_path}.")
                copy_tessdata_files(possible_tessdata_path, code_env_resources_path)
            else:
                logger.info("No .traineddata files found in " + possible_tessdata_path + ". Please set the TESSDATA_PREFIX env var manually.")
