# SDFileHelpers.py - script functions developed for StarDrop Query Tool
# Use of this code is subject to the terms of the StarDrop License Agreement and use, copying or redistribution of this
# file for use by other persons or organisations is prohibited
# Copyright(C) Optibrium Ltd 2016
# stardrop-support@optibrium.com
#
"""
This module provides functions for dealing with SD files.
"""
import app
from collections import OrderedDict
from CSVHelper import checkType, createContEntry
import re
import sys
from enum import Enum

# Pick up the ``appui`` object from the ``__main__`` module when it provides one.
# If ``__main__`` is missing or has no ``appui`` attribute, the module-level name
# ``appui`` is simply left undefined here. Only those two lookup failures are
# tolerated, so that unrelated errors (or KeyboardInterrupt / SystemExit) are
# not silently swallowed.
try:
    appui = sys.modules["__main__"].appui
except (KeyError, AttributeError):
    pass


class ColumnInfo:
    """A class to represent a column of information to be loaded into StarDrop.

    Stores the column name, its index, the index of an associated error column
    (-1 when none has been assigned) and a hint describing how the raw values
    of the column should be parsed.
    """

    class ParseHint(Enum):
        """The kinds of data a column can be parsed as."""
        STRING = 1
        CONT = 2
        NON_SMILES_MOLECULE = 3
        SMILES_MOLECULE = 4
        DATETIME = 5

    def __init__(self, name, index, parse_hint):
        """
        @param: name: The column header text.
        @param: index: The position of the column in the loaded data.
        @param: parse_hint: A ColumnInfo.ParseHint member.
        """
        self.name = name
        self.columnIndex = index
        # -1 marks "no error column associated with this column".
        self.errorIndex = -1
        self.parse_hint = parse_hint

    def __repr__(self):
        """Return a debugging representation listing all stored attributes."""
        return "ColumnInfo(name=%r, columnIndex=%r, errorIndex=%r, parse_hint=%r)" % (
            self.name, self.columnIndex, self.errorIndex, self.parse_hint)


def extractTags(mol, headers):
    """Collect the data items of one molecule record and register new property names.
    Every line matching the tag-header pattern starts a new property; the non-blank
    lines that follow it (up to the next tag header) are its values, which are
    joined with ", ".
    @param: mol: Lines of text representing a molecule in a molfile.
    @param: headers: List of property names extracted so far; names not yet present
        are appended to it in place.
    @returns: Dictionary mapping property name to its joined value text.
    """
    tag_pattern = re.compile("^>.*<(.*)>")
    collected = {}
    current = None  # name of the property currently being read, None before the first tag
    for text in mol:
        found = tag_pattern.match(text)
        if found is not None:
            current = found.group(1)
            # A repeated tag name within the same record starts over with no values.
            collected[current] = []
            if current not in headers:
                headers.append(current)
            continue
        if current is None:
            # Still before the first tag header: nothing to collect yet.
            continue
        stripped = text.rstrip()
        if stripped:
            collected[current].append(stripped)
    return {name: ", ".join(values) for name, values in collected.items()}


def loadSDFile(input, config):
    """Convert contents of an sd file to headers and data.

    Records are delimited by lines starting with "$$$$". Lines that follow the
    final delimiter do not belong to a terminated record and are discarded.
    Every record contributes a 'Structure' value (all of its lines joined with
    newlines), an 'ID' value (its first line) and the properties found by
    extractTags. Properties missing from a record are filled with ''.

    @param: input: The lines of text from the sd file.
    @param: config: Configuration object handed on to checkType; its
        MAX_NUM_MOLECULES_TO_TEST_FOR_PARSING attribute limits how many entries
        of a molecule column are tested for being SMILES.
    @returns: An ordered dictionary of
        keys: ColumnInfo objects (headers)
        values: raw data columns.
    """
    # Split the input into one list of lines per "$$$$"-terminated record.
    mol = []
    mols = []
    for line in input:
        if line.startswith("$$$$"):
            mols.append(mol)
            mol = []
        else:
            mol.append(line.rstrip())

    headers = ['Structure', 'ID']
    data = []
    for mol in mols:
        # extractTags also appends newly seen property names to headers.
        moldata = extractTags(mol, headers)
        moldata['Structure'] = '\n'.join(mol)
        # An empty record (e.g. two consecutive delimiters) has no first line;
        # use an empty ID instead of failing with IndexError.
        moldata['ID'] = mol[0] if mol else ''
        data.append(moldata)

    # One row per record, with a value (possibly '') for every known header.
    rawdata = [[d.get(h, '') for h in headers] for d in data]

    dataset_ordered_dict = OrderedDict()
    for i, h in enumerate(headers):
        column = [row[i] for row in rawdata]
        # checkType gets its own copy so the stored column cannot be affected by it.
        data_type = checkType(list(column), h, config)
        if data_type == app.cont:
            parse_hint = ColumnInfo.ParseHint.CONT
        elif data_type == app.mol:
            # Test first few molecules to see if they are SMILES
            number_of_mols_to_check = min(config.MAX_NUM_MOLECULES_TO_TEST_FOR_PARSING, len(rawdata))
            successes = sum(
                1 for molecule in column[:number_of_mols_to_check]
                if appui.getMolFileFromSmiles(molecule))
            # Treat the column as SMILES only when more than half of the tested entries convert.
            if successes > number_of_mols_to_check / 2:
                parse_hint = ColumnInfo.ParseHint.SMILES_MOLECULE
            else:
                parse_hint = ColumnInfo.ParseHint.NON_SMILES_MOLECULE
        elif data_type == app.datetime:
            parse_hint = ColumnInfo.ParseHint.DATETIME
        else:
            parse_hint = ColumnInfo.ParseHint.STRING
        dataset_ordered_dict[ColumnInfo(h, i, parse_hint)] = column
    return dataset_ordered_dict


def createDatasetFromSDFile(appui, sd_lines, config):
    """Parse a sd file and load it into StarDrop. This function
    checks the type of the input data and attempts to create a
    column of the correct type in StarDrop.
    @param: appui: The interface to the StarDrop application
    @param: sd_lines: The lines of text from the sd file
    @param: config: Configuration object passed to loadSDFile; its optional
        STANDARD_DEVIATION_SUFFIX and DATE_FORMAT attributes are used when present
    @returns: The app.Dataset filled with one column per header
    """
    dataset = app.Dataset()
    dataset_ordered_dict = loadSDFile(sd_lines, config)

    # Column formatting is best effort: if ColumnFormatter is unavailable or
    # fails, fall back to the unformatted headers. Only Exception is caught so
    # that KeyboardInterrupt / SystemExit still propagate.
    try:
        import ColumnFormatter
        if hasattr(config, 'STANDARD_DEVIATION_SUFFIX'):
            headers = ColumnFormatter.formatColumns(list(dataset_ordered_dict.keys()), config.STANDARD_DEVIATION_SUFFIX)
        else:
            headers = ColumnFormatter.formatColumns(list(dataset_ordered_dict.keys()))
    except Exception:
        headers = list(dataset_ordered_dict.keys())

    # errorIndex refers to a position in this list of raw columns; build it once
    # instead of once per column.
    raw_columns = list(dataset_ordered_dict.values())

    for h in headers:
        datacol = []
        header_name = h.name
        raw_entries = dataset_ordered_dict[h]
        if h.errorIndex != -1:
            # Numeric column paired with the error column at position errorIndex.
            details = ((0.0, 0.0), (0, 0))
            datacolumn = app.Dataset.meta(header_name, app.cont, details)
            for value, error in zip(raw_entries, raw_columns[h.errorIndex]):
                try:
                    datacol.append(createContEntry(value, error))
                except Exception:
                    # Values that cannot be converted become invalid entries.
                    datacol.append(app.Dataset.createInvalidEntry(app.cont))
        else:
            column_type = h.parse_hint
            if column_type == ColumnInfo.ParseHint.CONT:
                datacolumn = app.Dataset.createNumberHeader(header_name)
                for entry in raw_entries:
                    try:
                        datacol.append(createContEntry(entry))
                    except Exception:
                        # Values that cannot be converted become invalid entries.
                        datacol.append(app.Dataset.createInvalidEntry(app.cont))
            elif column_type in (ColumnInfo.ParseHint.SMILES_MOLECULE, ColumnInfo.ParseHint.NON_SMILES_MOLECULE):
                datacolumn = app.Dataset.meta(header_name, app.mol)
                if column_type == ColumnInfo.ParseHint.NON_SMILES_MOLECULE:
                    # Entries are mol blocks: convert them line by line via appui.
                    for entry in raw_entries:
                        datacol.append(app.Dataset.molecule(
                            '', appui.getStarDropMoleculeFromMolFile(entry.split('\n')).data()))
                else:
                    for entry in raw_entries:
                        datacol.append(app.Dataset.molecule(str(entry)))
            elif column_type == ColumnInfo.ParseHint.DATETIME:
                if hasattr(config, 'DATE_FORMAT'):
                    datacolumn = app.Dataset.meta(header_name, app.datetime, config.DATE_FORMAT)
                else:
                    datacolumn = app.Dataset.meta(header_name, app.datetime, "%Y-%b-%d")
                for entry in raw_entries:
                    datacol.append(app.Dataset.date(entry))
            else:
                datacolumn = app.Dataset.meta(header_name)
                for entry in raw_entries:
                    datacol.append(app.Dataset.string(entry))
        dataset.columns.append(datacolumn)
        dataset.cols.append(datacol)
    return dataset
