# NIH_cactus.py - Web service for the NIH NCI CACTUS Chemical Identifier Resolver, to retrieve various chemical
# identifiers for molecular structures.
# Use of this code is subject to the terms of the StarDrop License Agreement and use, copying or redistribution of this
# file for use by other persons or organisations is prohibited
# Copyright(C) Optibrium Ltd 2023
# stardrop-support@optibrium.com
#

import os
import requests
import textwrap
from typing import Any, Optional

import app

import stardrop_helper_functions.dataset_import_export as dimex
import stardrop_helper_functions.logfile_handling as logfile
from stardrop_helper_functions.qt_widgets import DisplayGUIs, PromptGUIs, SpecializedGUIs

# Version of this script, handed to the logfile when it is initialised
version_number_str = '1.1'

# Number of new cell entries (selected rows times selected identifiers) above which the user
# is asked to confirm before the lookup starts.
# A quick test by the original author showed a runtime of about 0.5 seconds per new cell entry.
MAX_NEW_CELL_ENTRIES_WARNING = 60

# Root URL of the NIH NCI CACTUS structure resolver; the structure and the requested
# representation are appended to it as further path segments
BASE_URL = 'https://cactus.nci.nih.gov/chemical/structure'

# Maps each chemical identifier label that the user can pick onto the keyword that the
# NIH NCI CACTUS database uses for that identifier. The labels double as the names of the
# columns created in the dataset.
# NOTE(review): the two 'InChl' labels are spelled with a lower-case 'L' where the identifier is
# normally written 'InChI' - confirm before renaming, since existing column names would change.
identifier_API_words = {
    'Chemical Formula': 'formula',
    'Standard InChlKey': 'stdinchikey',
    'Standard InChl': 'stdinchi',
    'CAS Registry Number': 'cas',
    'IUPAC Name': 'iupac_name',
}


def select_identifiers_SMILES_and_rows(appui, dataset: dict[str, Any]) -> tuple[Optional[list[str]],
                                                                                Optional[list[tuple[int, Any, Any]]],
                                                                                Optional[list[int]],
                                                                                Optional[int],
                                                                                bool]:
    """
    Ask the user which chemical identifiers to resolve, pick the structure column and the rows to
    work on, and (for large requests) ask the user to confirm the expected runtime.
    If the dataset has several structure columns, only the first one is used and a warning is logged.
    If it has none, a warning window is shown; no exception is raised.
    @param: appui: The StarDrop GUI, queried for the rows currently highlighted in the dataset
    @param: dataset: A StarDrop dataset imported using dimex.import_stardrop_dataset()
    @returns: A 5-tuple of
              - the identifiers chosen by the user (falsy if nothing was chosen),
              - one (row index, ID entry, structure entry) tuple per dataset row,
              - the row indices to resolve (the highlighted rows, or all rows if none are highlighted),
              - the number of new cell entries (rows to resolve times chosen identifiers),
              - whether the script should carry on.
              The three middle items are None when no identifier was chosen or no structure
              column exists; the last item is False in those cases.
    """

    selected_identifiers = \
        PromptGUIs.select_items_from_list('Choose chemical identifiers',
                                          'Select the chemical identifers to be resolved',
                                          list(identifier_API_words.keys()),
                                          True)

    # Guard: nothing to do when the user did not pick any identifier
    if not selected_identifiers:
        logfile.record_error('User did not select any chemical identifiers to be resolved')
        return selected_identifiers, None, None, None, False

    quoted_identifiers = ', '.join(f"'{identifier}'" for identifier in selected_identifiers)
    logfile.record_info(f'User selected the following identifiers: {quoted_identifiers}')

    column_types = dataset['column type']
    molecular_column_names = [column_name for column_name in column_types if column_types[column_name] == app.mol]

    # Guard: without a structure column there is nothing that could be resolved
    if not molecular_column_names:
        err_msg = 'No Structure columns were detected'
        logfile.record_error(err_msg)
        DisplayGUIs.warning_window(err_msg,
                                   'No columns')
        return selected_identifiers, None, None, None, False

    molecular_column_name = molecular_column_names[0]

    if len(molecular_column_names) > 1:
        logfile.record_warning(f'Multiple SMILES columns were detected, choosing only the first one: {molecular_column_name}')  # noqa: E501

    smiles_column_entries = dataset['column'][molecular_column_name]

    all_rows = list(range(len(smiles_column_entries)))
    highlighted_rows = list(appui.getSelectedRows(dataset['name']))

    # Fall back to generated 'row #N' labels when the dataset has no ID column
    id_column_name = dataset['id column name']
    if id_column_name:
        id_column_entries = dataset['column'][id_column_name]
    else:
        id_column_entries = [f'row #{row + 1:1d}' for row in all_rows]

    # Every row is kept here; the caller decides per row whether it is resolved or left empty
    selected_smiles = list(zip(all_rows, id_column_entries, smiles_column_entries))

    # No highlighted rows means "process the whole dataset"
    selected_rows = highlighted_rows or all_rows

    num_new_cell_entries = len(selected_rows) * len(selected_identifiers)

    if num_new_cell_entries > MAX_NEW_CELL_ENTRIES_WARNING:

        # Each new cell entry costs one web request, so let the user back out of long runs
        runtime_warning = \
            textwrap.fill(' '.join([f'{num_new_cell_entries:1d} new cell entries were detected for the resolution of',
                                    'chemical identifiers. Communication with the NIH NCI CACTUS Webservice may cause',
                                    'this script to run slowly with so many new cell entries. Would you like to',
                                    'continue executing this script anyways?']),
                          width=100)

        continue_with_script = \
            PromptGUIs.select_continuation_status(runtime_warning,
                                                  'Chemical Identifier Resolver Continuation')

    else:
        continue_with_script = True

    if not continue_with_script:
        logfile.record_error(f'User chose to not continue resolving chemical identifiers due to too many rows: {len(selected_rows):1d}')  # noqa: E501

    return selected_identifiers, selected_smiles, selected_rows, num_new_cell_entries, continue_with_script


def resolve_smiles_into_identifier(molecule_id: str,
                                   smiles: Optional[str],
                                   identifier: str,
                                   identifer_API_word: str,
                                   timeout_seconds: float = 30.0) -> Optional[str]:
    """
    Retrieve the requested chemical identifier for the given molecule (i.e., SMILES string) from
    the NIH NCI CACTUS database. Returns None if the structure is empty or None, or if the server
    answered with any status code other than 200; the reason is written to the logfile.
    Network failures (including an exceeded timeout) are not caught here and propagate to the
    caller as exceptions raised by the requests library.
    @param: molecule_id: The ID for the molecule (i.e., the SMILES string) from the dataset's ID column
    @param: smiles: The SMILES string of the molecule
    @param: identifier: The chemical identifer as selected by the user (only used in log messages)
    @param: identifer_API_word: The corresponding text used by the NIH NCI CACTUS database to refer
                                to the requested chemical identifier
    @param: timeout_seconds: How long to wait for the server before giving up, so that an
                             unresponsive server cannot block the script indefinitely
    @returns: The chemical identifier retrieved from the NIH NCI CACTUS database, or None
    """

    if not smiles:
        logfile.record_warning(f"The structure for '{molecule_id}' is empty, could not retrieve '{identifier}'")  # noqa: E501
        return None

    # NOTE(review): quote() leaves '/' unescaped by default, so a SMILES containing '/' adds extra
    # path segments to the URL - confirm that the service resolves such structures as intended.
    request_url = '/'.join([BASE_URL, requests.utils.quote(smiles), identifer_API_word])

    url_response = requests.get(request_url, timeout=timeout_seconds)
    status_code = url_response.status_code

    if status_code != 200:

        if status_code == 404:
            logfile.record_error(f"The NIH NCI CACTUS Server did not have '{identifier}' for '{molecule_id}'")
        elif status_code == 500:
            logfile.record_error(f"Unexpected error while retrieving '{identifier}' for '{molecule_id}'")
        else:
            logfile.record_error(f"Unexpected HTTPS status code while retrieving '{identifier}' for '{molecule_id}': {status_code:3d}")  # noqa: E501

        return None

    retrieved_identifier = url_response.text

    # Drop a leading 'name=' prefix if present. Splitting only once keeps everything
    # after the first '=' instead of cutting the value short at a second '='.
    if retrieved_identifier and '=' in retrieved_identifier:
        retrieved_identifier = retrieved_identifier.split('=', 1)[1]

    return retrieved_identifier


def retrieve_NIH_CACTUS_data(appui,
                             dataset: dict):
    """
    Prompt the user (via select_identifiers_SMILES_and_rows) for the chemical identifiers they
    would like, resolve them against the NIH NCI CACTUS database one cell at a time while a
    progress window is shown, and export one new column per identifier into StarDrop.
    Rows that were not selected get an empty string. A column is only exported if at least one
    of its entries was resolved; identifiers without any resolved entry are reported to the user
    in a warning window afterwards. Entries that could not be resolved are exported as None,
    with 'Identifier not found' passed to the export as the invalid-entry message.
    An unexpected error stops the retrieval, is logged and shown to the user; columns completed
    before the error are still exported.
    @param: appui: The StarDrop GUI
    @param: dataset: A StarDrop dataset imported using dimex.import_stardrop_dataset();
                     the new columns are added to this dictionary in place
    """

    [selected_identifiers,
     selected_smiles,
     selected_rows,
     num_new_cell_entries,
     continue_with_script] = \
        select_identifiers_SMILES_and_rows(appui, dataset)

    if not continue_with_script:
        return

    [progress_iterator,
     progDlg] = \
        SpecializedGUIs.initialize_progress_window('Data Retrieval Status',
                                                   'Running NIH NCI CACTUS Data Retrieval',
                                                   num_new_cell_entries)

    progDlg.show()

    successfully_retrieved_column_names = []
    empty_identifier_columns = []

    try:

        # Built once so that the per-row membership test below does not scan a list every time
        rows_to_resolve = set(selected_rows)

        num_old_columns = len(dataset['column index'])

        for identifier in selected_identifiers:

            api_word = identifier_API_words[identifier]
            retrieved_identifiers = []

            for row, molecule_id, smiles in selected_smiles:

                if row in rows_to_resolve:

                    retrieved_identifier = \
                        resolve_smiles_into_identifier(molecule_id,
                                                       smiles,
                                                       identifier,
                                                       api_word)

                    progress_iterator += 1
                    progDlg.setProgress(progress_iterator)

                else:

                    # Rows outside the selection still need an entry so the column keeps its full length
                    retrieved_identifier = ''

                retrieved_identifiers.append(retrieved_identifier)

            if any(entry not in (None, '') for entry in retrieved_identifiers):

                # New columns are placed one after another, starting right after the existing columns
                if successfully_retrieved_column_names:
                    new_col_idx = dataset['column index'][successfully_retrieved_column_names[-1]] + 1
                else:
                    new_col_idx = num_old_columns

                successfully_retrieved_column_names.append(identifier)

                dataset['column'][identifier] = retrieved_identifiers
                dataset['column type'][identifier] = app.string
                dataset['column index'][identifier] = new_col_idx
                dataset['column format'][identifier] = None

            else:

                logfile.record_error(f'The \'{identifier}\' column was not exported because there was no successful resolution of this identifier for any compound.')  # noqa: E501
                empty_identifier_columns.append(identifier)

    except Exception as unexpected_error:

        generic_err_msg = 'Had to cancel identity resolution because of an unexpected error'
        logfile.record_error(generic_err_msg + ': ' + str(unexpected_error))
        DisplayGUIs.warning_window(generic_err_msg + ', please see the logfile for more details',
                                   'Unexpected error')

    finally:

        # Always close the progress window, even if the error handling above fails itself
        progDlg.hide()
        progDlg.delete()

    if successfully_retrieved_column_names:
        dimex.export_stardrop_column(appui,
                                     dataset,
                                     successfully_retrieved_column_names,
                                     invalid_entry_msg='Identifier not found')
    else:
        logfile.record_error('No columns could be exported from the NIH Cactus website')

    if empty_identifier_columns:
        DisplayGUIs.warning_window('The following identifier(s) returned no data for the submitted compound(s):\n\n' + '\n'.join(empty_identifier_columns),  # noqa: E501
                                   'Failure to resolve identifiers')


def run_chemical_identifier_resolver(appui):
    """
    Callback registered with StarDrop for this script: sets up the logfile, imports the
    current dataset with SMILES as the preferred structure format and, if a dataset was
    returned, runs the identifier retrieval on it.
    @param: appui: The StarDrop GUI
    """

    # The logfile title is derived from this file's name, e.g. 'some_script.py' -> 'Some Script'
    script_path_without_extension = os.path.splitext(os.path.realpath(__file__))[0]
    script_title = os.path.basename(script_path_without_extension).replace('_', ' ').title()

    logfile.initialize_logfile(script_title,
                               version_number_str,
                               os.getcwd(),
                               'chemical_identifier_resolver')

    try:

        dataset = dimex.import_stardrop_dataset(appui, preferred_mol_str_type='SMILES')

        if dataset:
            retrieve_NIH_CACTUS_data(appui, dataset)

    except Exception as err:
        # NOTE(review): presumably this re-raises after logging, in which case the
        # finish_logfile() call below is not reached on errors - confirm against the helper.
        logfile.record_and_raise_exception(err)

    logfile.finish_logfile()


def get_stardrop_definitions():
    """
    Describe the script provided by this file: a single entry giving the script's
    display name and the callback that runs it.
    @returns: A list with one dictionary holding the 'script_name' and 'callback' entries
    """

    return [{'script_name': 'Chemical Identifier Resolver',
             'callback': run_chemical_identifier_resolver}]
