#!/usr/bin/env python3
# ruff: noqa
"""
Script to convert a LSDALTON molecule input file into a standard XYZ file.

The LSDALTON input file has a header that may contain "Angstrom" (or "Ang")
to indicate that the coordinates are provided in Angstrom. If that token is not found,
then the coordinates are assumed to be given in Bohr and are converted to Angstrom
using 1 bohr = 0.52917721092 Angstrom.

The molecule input file consists of several blocks. Each block starts with a line
of the form "Charge=... Atoms=N" followed by N lines, each with an element symbol
and three coordinates.
"""

import argparse
import re

# Conversion factor from bohr to angstrom (1 bohr = 0.52917721092 Angstrom).
# NOTE(review): the value appears to match the CODATA 2010 Bohr radius — confirm
# against the reference used by the producing program before changing it.
BOHR_TO_ANGSTROM: float = 0.52917721092


def parse_lsdalton_file(lines: list[str]) -> tuple[bool, list[tuple[str, float, float, float]]]:
    """
    Parse the lines of a LSDALTON molecule input file.

    The first line starting with "Atomtypes=" (case insensitive) is treated as the
    header. If that line contains "ang", the coordinates are taken to be in Angstrom;
    otherwise (or if no header exists) they are assumed to be in Bohr and converted.

    After the header, every line starting with "Charge=" opens a block whose
    "Atoms=N" field gives the number of atom lines that follow. Blank lines inside
    a block are skipped and do not count towards N.

    Args:
        lines: List of strings representing the lines of the input file.

    Returns:
        A tuple where the first element is a boolean indicating if the input
        coordinates were in Angstrom, and the second element is a list of tuples
        (element, x, y, z) with coordinates in Angstrom.

    Raises:
        ValueError: If a block header has no "Atoms=N" field, an atom line has fewer
            than four tokens, a coordinate is not a valid float, or the file ends
            before all declared atoms of a block have been read.

    """
    # Default: assume coordinates are given in Bohr.
    coords_in_angstrom = False

    # Locate the first header line; it is the only place the unit token is looked for.
    header_index = None
    for idx, line in enumerate(lines):
        lowered = line.strip().lower()
        if lowered.startswith("atomtypes="):
            header_index = idx
            coords_in_angstrom = "ang" in lowered
            break

    # Without a header there is nothing to skip, so scanning must begin at the very
    # first line; otherwise a "Charge=" block header on line 0 would be ignored.
    idx = 0 if header_index is None else header_index + 1

    atoms: list[tuple[str, float, float, float]] = []
    total_lines = len(lines)

    while idx < total_lines:
        line = lines[idx].strip()
        idx += 1
        # Only block header lines ("Charge=...", case insensitive) are of interest here.
        if not line.lower().startswith("charge="):
            continue

        atoms_match = re.search(r"Atoms\s*=\s*(\d+)", line, re.IGNORECASE)
        if not atoms_match:
            msg = f"Could not find number of atoms in block header: {line}"
            raise ValueError(msg)

        # Count down actual atom entries rather than consumed lines, so that blank
        # lines inside a block cannot cause trailing atoms to be silently dropped.
        remaining = int(atoms_match.group(1))
        while remaining > 0:
            if idx >= total_lines:
                msg = "Unexpected end of file while reading atomic coordinates."
                raise ValueError(msg)
            atom_line = lines[idx].strip()
            idx += 1
            if not atom_line:
                continue

            tokens = atom_line.split()
            if len(tokens) < 4:
                msg = f"Invalid atomic line: {atom_line}"
                raise ValueError(msg)

            # First token is the element symbol; next three are coordinates.
            element = tokens[0]
            try:
                x = float(tokens[1])
                y = float(tokens[2])
                z = float(tokens[3])
            except ValueError as ex:
                msg = f"Error converting coordinates in line: {atom_line}"
                raise ValueError(msg) from ex
            atoms.append((element, x, y, z))
            remaining -= 1

    # If the coordinates are given in Bohr, convert them to Angstrom.
    if not coords_in_angstrom:
        atoms = [
            (element, x * BOHR_TO_ANGSTROM, y * BOHR_TO_ANGSTROM, z * BOHR_TO_ANGSTROM) for (element, x, y, z) in atoms
        ]
    return coords_in_angstrom, atoms


def write_xyz_file(output_filename: str, atoms: list[tuple[str, float, float, float]]) -> None:
    """
    Write the atomic coordinates to an XYZ file.

    The file layout is: the atom count on the first line, a fixed comment on the
    second line, then one "symbol x y z" line per atom with six decimal places.

    Args:
        output_filename: The path to the output XYZ file (created or overwritten).
        atoms: List of tuples (element, x, y, z) in Angstrom.

    """
    with open(output_filename, "w", encoding="utf-8") as xyz:
        # Two-line XYZ preamble: atom count, then a free-form comment.
        xyz.write(f"{len(atoms)}\n")
        xyz.write("Converted from LSDALTON input file\n")
        # Stream the atom records lazily; each is formatted only as it is written.
        xyz.writelines(f"{symbol} {cx:.6f} {cy:.6f} {cz:.6f}\n" for symbol, cx, cy, cz in atoms)


def convert_lsdalton_to_xyz(input_filename: str, output_filename: str) -> None:
    """
    Convert an LSDALTON molecule input file to a standard XYZ file.

    Reads every line of the input file, hands them to ``parse_lsdalton_file`` and
    writes the resulting atom list with ``write_xyz_file``. The unit flag returned
    by the parser is not used here.

    Args:
        input_filename: Path to the LSDALTON input file.
        output_filename: Path to the output XYZ file.

    """
    with open(input_filename, encoding="utf-8") as source:
        raw_lines = list(source)

    _unit_flag, atom_records = parse_lsdalton_file(raw_lines)
    write_xyz_file(output_filename, atom_records)


def main() -> None:
    """Read the input and output paths from the command line and convert the file."""
    cli = argparse.ArgumentParser(description="Convert LSDALTON molecule input file to XYZ format.")

    # Both arguments are required positionals, registered in command-line order.
    positionals = (
        ("input_file", "Path to the LSDALTON input file."),
        ("output_file", "Path for the generated XYZ file."),
    )
    for arg_name, help_text in positionals:
        cli.add_argument(arg_name, help=help_text)

    namespace = cli.parse_args()
    convert_lsdalton_to_xyz(namespace.input_file, namespace.output_file)


# Script entry point: run the command-line converter when executed directly.
if __name__ == "__main__":
    # No unit tests are defined in this file; execution always dispatches to main().
    main()
