18-07

2023-07-18 09:16:38 +01:00 · 2023-07-18 09:16:38 +01:00 · 2370686d72
commit 2370686d72
parent 37b2bbce98
24 changed files with 18 additions and 1008 deletions
--- a/pycache/init.cpython-311.pyc
+++ b/pycache/init.cpython-311.pyc
--- a/pycache/settings.cpython-311.pyc
+++ b/pycache/settings.cpython-311.pyc
--- a/pycache/settings_v1.cpython-311.pyc
+++ b/pycache/settings_v1.cpython-311.pyc
--- a/io/parsers/pycache/init.cpython-311.pyc
+++ b/io/parsers/pycache/init.cpython-311.pyc
--- a/io/parsers/pycache/excel.cpython-311.pyc
+++ b/io/parsers/pycache/excel.cpython-311.pyc
--- a/io/parsers/mirri_excel.py
+++ b/io/parsers/mirri_excel.py
@ -19,12 +19,6 @@ from mirri.settings import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO,
                            NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION,
                            ONLY_RESEARCH, ONTOBIOTOPE,
                            PUBLICATION_FIELDS, STRAINS, SUBTAXAS)
 from mirri.settings_v1 import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO,
                            GROWTH_MEDIA, LITERATURE_SHEET, LOCATIONS,
                            MIRRI_FIELDS, NAGOYA_DOCS_AVAILABLE, NAGOYA_NO_RESTRICTIONS,
                            NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION,
                            ONLY_RESEARCH, ONTOBIOTOPE,
                            PUBLICATION_FIELDS, STRAINS, SUBTAXAS)
 from mirri.utils import get_country_from_name
 RESTRICTION_USE_TRANSLATOR = {
@ -44,34 +38,12 @@ TRUEFALSE_TRANSLATOR = {
 def parse_mirri_excel(fhand, version=""):
    # Dispatches on the version string to a version-specific parser and
    # raises NotImplementedError for any other value.
    # NOTE(review): the removed line compared against "20200602" while the
    # removed error message named 20200601; the added line routes "5.1.2" to
    # the 20200601 parser -- confirm that mapping is intended.
-    if version == "20200602":
+    if version == "5.1.2":
        return _parse_mirri_v20200601(fhand)
    elif version == "12052023":
        return _parse_mirri_v12052023(fhand)            
    else:
        # NOTE(review): the added message has its words out of order and does
        # not mention "12052023", which is still accepted by the branch above.
-        raise NotImplementedError("Only versions 20200601 and 12052023 are implemented")
+        raise NotImplementedError("Only version is 5.1.2 implemented")
 def _parse_mirri_v20200601(fhand):
    """Parse a MIRRI Excel workbook through the 20200601 code path.

    The handle is rewound and read completely into an in-memory buffer, which
    is then opened with openpyxl in read-only, values-only mode.

    Returns a dict with two keys: "strains" (whatever ``parse_strains``
    returns) and "growth_media" (a list).
    """
    fhand.seek(0)
    workbook = load_workbook(filename=BytesIO(fhand.read()),
                             read_only=True, data_only=True)

    # The auxiliary sheets are read in a fixed order before the strains,
    # because the strain parser receives all of them as keyword arguments.
    origin_rows = workbook_sheet_reader(workbook, LOCATIONS)
    ontobiotope_rows = workbook_sheet_reader(workbook, ONTOBIOTOPE)
    media = list(parse_growth_media(workbook))
    marker_rows = workbook_sheet_reader(workbook, GENOMIC_INFO)
    literature = list(parse_publications(workbook))

    parsed_strains = parse_strains(
        workbook,
        locations=origin_rows,
        growth_media=media,
        markers=marker_rows,
        publications=literature,
        ontobiotopes=ontobiotope_rows,
    )
    return {"strains": parsed_strains, "growth_media": media}
 def _parse_mirri_v12052023(fhand):
    fhand.seek(0)
    file_content = BytesIO(fhand.read())
--- a/io/writers/mirri_excel.py
+++ b/io/writers/mirri_excel.py
@ -5,7 +5,6 @@ from openpyxl.workbook.workbook import Workbook
 from mirri import rgetattr
 from mirri.settings import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS
 from mirri.settings_v1 import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS
 from mirri.io.parsers.mirri_excel import NAGOYA_TRANSLATOR, RESTRICTION_USE_TRANSLATOR
 INITIAL_SEXUAL_STATES = [
@ -51,81 +50,9 @@ PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS]
 def write_mirri_excel(path, strains, growth_media, version):
    # Dispatches on the version string to a version-specific Excel writer.
    # NOTE(review): there is no else/raise here, so a version that matches
    # neither branch returns without writing anything.
-    if version == "20200601":
+    if version == "5.1.2":
        _write_mirri_excel_20200601(path, strains, growth_media)
    if version == "12052023":
        _write_mirri_excel_12052023(path, strains, growth_media)
 def _write_mirri_excel_20200601(path, strains, growth_media):
    """Write strains and growth media to an Excel workbook at ``path``.

    Sheets are created in this order: whatever the marker, ontobiotope and
    growth-media helpers add, then "Strains", "Geographic origin",
    "Literature", "Sexual state" and "Genomic information". The default
    "Sheet" created by openpyxl is removed before saving.
    """
    workbook = Workbook()

    write_markers_sheet(workbook)
    write_ontobiotopes(workbook, DATA_DIR / "ontobiotopes.csv")
    write_growth_media(workbook, growth_media)
    media_acronyms = [str(medium.acronym) for medium in growth_media]

    # These containers are handed to _deserialize_strains and read back
    # afterwards; presumably it fills them while producing the strain rows,
    # so the rows must be fully materialised before the other sheets.
    locations = {}
    publications = {}
    sexual_states = set(deepcopy(INITIAL_SEXUAL_STATES))
    genomic_markers = {}
    strain_rows = list(
        _deserialize_strains(strains, locations, media_acronyms,
                             publications, sexual_states, genomic_markers)
    )

    # Strains sheet: header row followed by one row per strain.
    strains_ws = workbook.create_sheet("Strains")
    strains_ws.append([column["label"] for column in MIRRI_FIELDS])
    for values in strain_rows:
        strains_ws.append(values)
    redimension_cell_width(strains_ws)

    # Geographic origin sheet: the dict key goes in the "Locality" column and
    # the running position is used as the ID.
    origin_ws = workbook.create_sheet("Geographic origin")
    origin_ws.append(["ID", "Country", "Region", "City", "Locality"])
    for position, (locality, place) in enumerate(locations.items()):
        origin_ws.append([position, place.country, place.state,
                          place.municipality, locality])
    redimension_cell_width(origin_ws)

    # Literature sheet: one column per entry of PUBLICATION_FIELDS; missing
    # attributes are written as None.
    literature_ws = workbook.create_sheet("Literature")
    literature_ws.append(PUB_HEADERS)
    for reference in publications.values():
        literature_ws.append(
            [getattr(reference, column['attribute'], None)
             for column in PUBLICATION_FIELDS]
        )
    redimension_cell_width(literature_ws)

    # Sexual state sheet: one sorted value per row, no header.
    states_ws = workbook.create_sheet("Sexual state")
    for state in sorted(sexual_states):
        states_ws.append([state])
    redimension_cell_width(states_ws)

    # Genomic information sheet: one row per (strain, marker) pair.
    genomic_ws = workbook.create_sheet("Genomic information")
    genomic_ws.append(['Strain AN', 'Marker', 'INSDC AN', 'Sequence'])
    for accession, strain_markers in genomic_markers.items():
        for item in strain_markers:
            genomic_ws.append([accession, item.marker_type, item.marker_id,
                               item.marker_seq])
    redimension_cell_width(genomic_ws)

    del workbook["Sheet"]
    workbook.save(str(path))
 def _write_mirri_excel_12052023(path, strains, growth_media):
    wb = Workbook()
--- a/settings_v1.py
+++ b/settings_v1.py
@ -1,311 +0,0 @@
 from pathlib import Path
 from charset_normalizer import VERSION
 # Directory named "data" located next to this settings module.
 DATA_DIR = Path(__file__).parent / "data"
 # String identifiers for strain-level fields.
 ACCESSION_NUMBER = "accession_number"
 RESTRICTION_ON_USE = "restriction_on_use"
 NAGOYA_PROTOCOL = "nagoya_protocol"
 ABS_RELATED_FILES = "abs_related_files"
 MTA_FILES = "mta_file"
 OTHER_CULTURE_NUMBERS = "other_culture_collection_numbers"
 STRAIN_FROM_REGISTERED_COLLECTION = "strain_from_a_registered_collection"
 RISK_GROUP = "risk_group"
 DUAL_USE = "dual_use"
 QUARANTINE = "quarantine"
 ORGANISM_TYPE = "organism_type"
 TAXON_NAME = "taxon_name"
 INFRASUBSPECIFIC_NAME = "infrasubspecific_names"
 COMMENTS_ON_TAXONOMY = "comments_on_taxonomy"
 STATUS = "status"
 HISTORY_OF_DEPOSIT = "history_of_deposit"
 DEPOSITOR = "depositor"
 DATE_OF_DEPOSIT = "date_of_deposit"
 COLLECTED_BY = "collected_by"
 DATE_OF_COLLECTION = "date_of_collection"
 ISOLATED_BY = "isolated_by"
 DATE_OF_ISOLATION = "date_of_isolation"
 DATE_OF_INCLUSION = "date_of_inclusion_on_catalog"
 TESTED_TEMPERATURE_GROWTH_RANGE = "tested_temperature_growth_range"
 RECOMMENDED_GROWTH_TEMP = "recommended_growth_temperature"
 RECOMMENDED_GROWTH_MEDIUM = "recommended_media_for_growth"
 FORM_OF_SUPPLY = "form_of_supply"
 GEO_COORDS = "coordinates_of_geographic_origin"
 ACCESSION_NAME = "other_denomination"
 # NOTE(review): ALTITUDE is assigned again in the Location section below
 # ("altitude"); that later value is the one in effect once the module loads.
 ALTITUDE = "altitude_of_geographic_origin"
 GEOGRAPHIC_ORIGIN = "geographic_origin"
 GMO = "gmo"
 GMO_CONSTRUCTION_INFO = "gmo_construction_information"
 MUTANT_INFORMATION = "mutant_information"
 GENOTYPE = "genotype"
 LITERATURE = "literature"
 SEXUAL_STATE = "sexual_state"
 PLOIDY = "ploidy"
 INTERSPECIFIC_HYBRID = "interspecific_hybrid"
 HYBRIDS = 'hybrids'
 PLANT_PATHOGENICITY_CODE = "plant_pathogenicity_code"
 PATHOGENICITY = "pathogenicity"
 ENZYME_PRODUCTION = "enzyme_production"
 PRODUCTION_OF_METABOLITES = "production_of_metabolites"
 APPLICATIONS = "applications"
 REMARKS = "remarks"
 PLASMIDS = "plasmids"
 PLASMIDS_COLLECTION_FIELDS = "plasmids_collections_fields"
 SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation"
 ISOLATION_HABITAT = "isolation_habitat"
 ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat"
 LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome"
 # StrainId
 STRAIN_ID = "id"
 COLLECTION_CODE = "collection_code"
 STRAIN_PUI = "strain_pui"
 STRAIN_URL = "strain_url"
 ID_SYNONYMS = 'id_synonyms'
 # Taxonomy
 GENUS = "genus"
 SPECIES = "species"
 # Location
 COUNTRY = "countryOfOriginCode"
 SITE = "site"
 STATE = "state"
 PROVINCE = "province"
 MUNICIPALITY = "municipality"
 ISLAND = "island"
 OTHER = "other"
 LATITUDE = "latitude"
 LONGITUDE = "longitude"
 # Rebinds ALTITUDE (first assigned above as "altitude_of_geographic_origin").
 ALTITUDE = "altitude"
 GEOREF_METHOD = "georeferencingMethod"
 COORDUNCERTAINTY = "coordUncertainty"
 COORD_SPATIAL_REFERENCE = "coordenatesSpatialReference"
 LOCATION = "location"
 # Location keys grouped into one list; presumably the keys accepted for a
 # collecting-site record -- verify against the code that consumes it.
 ALLOWED_COLLECTING_SITE_KEYS = [
    COUNTRY,
    STATE,
    PROVINCE,
    ISLAND,
    MUNICIPALITY,
    OTHER,
    SITE,
    LATITUDE,
    LONGITUDE,
    ALTITUDE,
    GEOREF_METHOD,
    COORDUNCERTAINTY,
    COORD_SPATIAL_REFERENCE,
 ]
 # Ordered list of strain columns. Each entry pairs an "attribute" (a plain or
 # dotted name, presumably resolved on a strain object -- verify against the
 # reader/writer code) with the "label" used as the Excel column header.
 # NOTE(review): most attributes are string literals while a few use the
 # constants defined above; "mta_files" here differs from MTA_FILES
 # ("mta_file") -- confirm which spelling the strain model uses.
 MIRRI_FIELDS = [
    {"attribute": "id", "label": "Accession number"},
    {"attribute": "restriction_on_use", "label": "Restrictions on use"},
    {"attribute": "nagoya_protocol",
        "label": "Nagoya protocol restrictions and compliance conditions"},
    {"attribute": ABS_RELATED_FILES, "label": "ABS related files"},
    {"attribute": "mta_files", "label": "MTA file"},
    {"attribute": "other_numbers", "label": "Other culture collection numbers"},
    {"attribute": "is_from_registered_collection",
        "label": "Strain from a registered collection"},
    {"attribute": "risk_group", "label": "Risk Group"},
    {"attribute": "is_potentially_harmful", "label": "Dual use"},
    {"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"},
    {"attribute": "taxonomy.organism_type", "label": "Organism type"},
    {"attribute": "taxonomy.taxon_name", "label": "Taxon name"},
    {"attribute": "taxonomy.infrasubspecific_name",
        "label": "Infrasubspecific names"},
    {"attribute": "taxonomy.comments", "label": "Comment on taxonomy"},
    {"attribute": "taxonomy.interspecific_hybrid",
        "label": "Interspecific hybrid"},
    {"attribute": "status", "label": "Status"},
    {"attribute": "history", "label": "History of deposit", },
    {"attribute": "deposit.who", "label": "Depositor"},
    {"attribute": "deposit.date", "label": "Date of deposit"},
    {"attribute": "catalog_inclusion_date",
        "label": "Date of inclusion in the catalogue"},
    {"attribute": "collect.who", "label": "Collected by"},
    {"attribute": "collect.date", "label": "Date of collection"},
    {"attribute": "isolation.who", "label": "Isolated by"},
    {"attribute": "isolation.date", "label": "Date of isolation"},
    {"attribute": "isolation.substrate_host_of_isolation",
        "label": "Substrate/host of isolation"},
    {"attribute": "growth.tested_temp_range",
        "label": "Tested temperature growth range"},
    {"attribute": "growth.recommended_temp",
        "label": "Recommended growth temperature"},
    {"attribute": "growth.recommended_media",
        "label": "Recommended medium for growth"},
    {"attribute": "form_of_supply", "label": "Form of supply"},
    {"attribute": "other_denominations", "label": "Other denomination"},
    {"attribute": "collect.location.coords",
        "label": "Coordinates of geographic origin"},
    {"attribute": "collect.location.altitude",
        "label": "Altitude of geographic origin"},
    {"attribute": "collect.location", "label": "Geographic origin"},
    {"attribute": "collect.habitat", "label": "Isolation habitat"},
    {"attribute": "collect.habitat_ontobiotope",
        "label": "Ontobiotope term for the isolation habitat"},
    {"attribute": "genetics.gmo", "label": "GMO"},
    {"attribute": "genetics.gmo_construction",
        "label": "GMO construction information"},
    {"attribute": "genetics.mutant_info", "label": "Mutant information"},
    {"attribute": "genetics.genotype", "label": "Genotype"},
    {"attribute": "genetics.sexual_state", "label": "Sexual state"},
    {"attribute": "genetics.ploidy", "label": "Ploidy"},
    {"attribute": "genetics.plasmids", "label": "Plasmids"},
    {"attribute": "genetics.plasmids_in_collections",
        "label": "Plasmids collections fields"},
    {"attribute": "publications", "label": "Literature"},
    {"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"},
    {"attribute": "pathogenicity", "label": "Pathogenicity"},
    {"attribute": "enzyme_production", "label": "Enzyme production"},
    {"attribute": "production_of_metabolites",
        "label": "Production of metabolites"},
    {"attribute": "applications", "label": "Applications", },
    {"attribute": "remarks", "label": "Remarks"},
    {"attribute": LITERATURE_LINKED_TO_SEQ_GENOME,
        "label": "Literature linked to the sequence/genome"},
 ]
 # Rank names below species level, and the full list of ranks built from them.
 ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma",
                   'forma.specialis']
 ALLOWED_TAXONOMIC_RANKS = ["family", "genus", "species"] + ALLOWED_SUBTAXA
 # nagoya
 NAGOYA_NO_RESTRICTIONS = "no_known_restrictions_under_the_Nagoya_protocol"
 NAGOYA_DOCS_AVAILABLE = "documents_providing_proof_of_legal_access_and_terms_of_use_available_at_the_collection"
 NAGOYA_PROBABLY_SCOPE = "strain_probably_in_scope,_please_contact_the_culture_collection"
 ALLOWED_NAGOYA_OPTIONS = [NAGOYA_NO_RESTRICTIONS,
                           NAGOYA_DOCS_AVAILABLE, NAGOYA_PROBABLY_SCOPE]
 # Use restriction
 NO_RESTRICTION = "no_restriction"
 ONLY_RESEARCH = "only_research"
 COMMERCIAL_USE_WITH_AGREEMENT = "commercial_use_with_agreement"
 ALLOWED_RESTRICTION_USE_OPTIONS = [
    NO_RESTRICTION,
    ONLY_RESEARCH,
    COMMERCIAL_USE_WITH_AGREEMENT,
 ]
 # Risk groups are kept as strings, not integers.
 ALLOWED_RISK_GROUPS = ["1", "2", "3", "4"]
 # Forms of supply
 AGAR = "Agar"
 CRYO = "Cryo"
 DRY_ICE = "Dry Ice"
 LIQUID_CULTURE_MEDIUM = "Liquid Culture Medium"
 LYO = "Lyo"
 OIL = "Oil"
 WATER = "Water"
 ALLOWED_FORMS_OF_SUPPLY = [AGAR, CRYO, DRY_ICE,
                            LIQUID_CULTURE_MEDIUM, LYO, OIL, WATER]
 # Section names; these match the dotted prefixes used in MIRRI_FIELDS
 # attributes (e.g. "deposit.who", "taxonomy.taxon_name").
 DEPOSIT = "deposit"
 ISOLATION = "isolation"
 COLLECT = "collect"
 GROWTH = "growth"
 GENETICS = "genetics"
 TAXONOMY = "taxonomy"
 # Markers
 # NOTE(review): MARKERS is assigned again in the sheet-name section near the
 # end of this file ("Markers"); that later value wins after import.
 MARKERS = "markers"
 MARKER_TYPE = "marker_type"
 MARKER_INSDC = "INSDC"
 MARKER_SEQ = "marker_seq"
 # Each entry pairs a short acronym with the marker's descriptive name.
 ALLOWED_MARKER_TYPES = [
    {"acronym": "16S rRNA", "marker": "16S rRNA"},
    {"acronym": "ACT", "marker": "Actin"},
    {"acronym": "CaM", "marker": "Calmodulin"},
    {"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"},
    {"acronym": "ITS",
        "marker": "nuclear ribosomal Internal Transcribed Spacer (ITS)"},
    {"acronym": "LSU", "marker": "nuclear ribosomal Large SubUnit (LSU)"},
    {"acronym": "RPB1", "marker": "Ribosomal RNA-coding genes RPB1"},
    {"acronym": "RPB2", "marker": "Ribosomal RNA-coding genes RPB2"},
    {"acronym": "TUBB", "marker": "β-Tubulin"},
 ]
 # Publication attribute names.
 PUBLICATIONS = "publications"
 PUB_ID = "id"
 PUB_DOI = "pub_doi"
 # NOTE(review): empty string and not referenced by PUBLICATION_FIELDS below;
 # left unchanged -- confirm the intended attribute name.
 PUB_PUBMED_ID = ''
 PUB_FULL_REFERENCE = "full_reference"
 PUB_TITLE = "title"
 PUB_AUTHORS = "authors"
 PUB_JOURNAL = "journal"
 PUB_YEAR = "year"
 PUB_VOLUME = "volume"
 PUB_ISSUE = "issue"
 PUB_FIRST_PAGE = "first_page"
 PUB_LAST_PAGE = "last_page"
 BOOK_TITLE = "book_title"
 BOOK_EDITOR = "book_editor"
 BOOK_PUBLISHER = "book_publisher"
 # Ordered list of literature columns: "label" is the column header and
 # "attribute" is the name looked up on each publication.
 PUBLICATION_FIELDS = [
    {"label": "ID", "attribute": PUB_ID},
    {"label": "Full reference", "attribute": PUB_FULL_REFERENCE},
    {"label": "Authors", "attribute": PUB_AUTHORS},
    {"label": "Title", "attribute": PUB_TITLE},
    {"label": "Journal", "attribute": PUB_JOURNAL},
    {"label": "Year", "attribute": PUB_YEAR},
    {"label": "Volume", "attribute": PUB_VOLUME},
    {"label": "Issue", "attribute": PUB_ISSUE},
    {"label": "First page", "attribute": PUB_FIRST_PAGE},
    # Fixed: this column was bound to PUB_FIRST_PAGE, so the "Last page"
    # column repeated the first page.
    {"label": "Last page", "attribute": PUB_LAST_PAGE},
    {"label": "Book title", "attribute": BOOK_TITLE},
    {"label": "Editors", "attribute": BOOK_EDITOR},
    {"label": "Publisher", "attribute": BOOK_PUBLISHER},
 ]
 # ploidy
 ANEUPLOID = 0
 HAPLOID = 1
 DIPLOID = 2
 TRIPLOID = 3
 TETRAPLOID = 4
 POLYPLOID = 9
 ALLOWED_PLOIDIES = [ANEUPLOID, HAPLOID, DIPLOID, TRIPLOID, TETRAPLOID,
                     POLYPLOID]
 # Maps abbreviated infraspecific rank markers to the rank names listed in
 # ALLOWED_SUBTAXA.
 SUBTAXAS = {
    "subsp.": "subspecies",
    "var.": "variety",
    "convar.": "convarietas",
    "group.": "group",
    "f.": "forma",
    "f.sp.": "forma.specialis"
 }
 # Control
 # NOTE(review): this rebinds VERSION, which is also imported from
 # charset_normalizer at the top of this file; the import is shadowed here.
 VERSION = "Version"
 DATE = "Date"
 # Control fields
 CONTROL_FIELDS = [
    {"label": "Version", "attribute": VERSION},
    {"label": "Date", "attribute": DATE},
 ]
 # Excel sheet name
 LOCATIONS = "Geographic origin"  # 'Locations'
 GROWTH_MEDIA = "Growth media"
 GENOMIC_INFO = "Genomic information"
 STRAINS = "Strains"
 LITERATURE_SHEET = "Literature"
 SEXUAL_STATE_SHEET = "Sexual state"
 RESOURCE_TYPES_VALUES = "Resource types values"
 FORM_OF_SUPPLY_SHEET = "Forms of supply"
 PLOIDY_SHEET = "Ploidy"
 ONTOBIOTOPE = "Ontobiotope"
 # Rebinds MARKERS (assigned earlier in this file as "markers").
 MARKERS = "Markers"
 CONTROL_SHEET = "Version"
--- a/validation/2B90F320
+++ b/validation/2B90F320
--- a/validation/B3F84180
+++ b/validation/B3F84180
--- a/validation/pycache/init.cpython-311.pyc
+++ b/validation/pycache/init.cpython-311.pyc
--- a/validation/pycache/excel_validator.cpython-311.pyc
+++ b/validation/pycache/excel_validator.cpython-311.pyc
--- a/validation/pycache/tags.cpython-311.pyc
+++ b/validation/pycache/tags.cpython-311.pyc
--- a/validation/pycache/validate_v5.cpython-311.pyc
+++ b/validation/pycache/validate_v5.cpython-311.pyc
--- a/validation/pycache/validation_conf_12052023.cpython-311.pyc
+++ b/validation/pycache/validation_conf_12052023.cpython-311.pyc
--- a/validation/pycache/validation_conf_20200601.cpython-311.pyc
+++ b/validation/pycache/validation_conf_20200601.cpython-311.pyc
--- a/validation/error_logging/pycache/init.cpython-311.pyc
+++ b/validation/error_logging/pycache/init.cpython-311.pyc
--- a/validation/error_logging/pycache/error.cpython-311.pyc
+++ b/validation/error_logging/pycache/error.cpython-311.pyc
--- a/validation/error_logging/pycache/error_log.cpython-311.pyc
+++ b/validation/error_logging/pycache/error_log.cpython-311.pyc
--- a/validation/error_logging/pycache/error_message.cpython-311.pyc
+++ b/validation/error_logging/pycache/error_message.cpython-311.pyc
--- a/validation/excel_validator.py
+++ b/validation/excel_validator.py
@ -16,20 +16,19 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS
                                   TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE,
                                   HISTORY,NAGOYA1, VERSION)
 from mirri.settings import LOCATIONS, SUBTAXAS
 from mirri.settings_v1 import LOCATIONS, SUBTAXAS
 from mirri.validation.validation_conf_12052023 import version_config
 from mirri.validation.validation_conf_12052023 import MIRRI_12052023_VALLIDATION_CONF
 # NOTE(review): two definitions of validate_mirri_excel appear back to back
 # here; this looks like a diff hunk whose +/- markers were lost. Read as
 # plain Python, the second definition replaces the first, and the first one
 # assigns `configuration` but never returns or uses it.
 def validate_mirri_excel(fhand, version= "5.1.2" ):
    # Selects the validation configuration for version "5.1.2"; any other
    # version raises NotImplementedError.
    if  version == "5.1.2":
        configuration = MIRRI_12052023_VALLIDATION_CONF            
    else:
        raise NotImplementedError("Only version 5.1.2 is implemented")
 def validate_mirri_excel(fhand, version="", date=""):
    # Looks the configuration up by version, checks the template date and
    # then delegates to validate_excel.
    # Raises NotImplementedError when the version has no configuration and
    # ValueError when the effective date is not "12/05/2023".
    configuration = version_config.get(version)
    if configuration is None:
        raise NotImplementedError("Unsupported version: " + version)
    # NOTE(review): this writes into the object returned by version_config;
    # if that object is shared module state, a date passed on one call
    # persists into later calls -- confirm.
    configuration["date"] = date or configuration.get("date")  
    if configuration["date"] != "12/05/2023":
        raise ValueError("Invalid date. Expected: 12/05/2023")
    return validate_excel(fhand, configuration)
 def version(value , validation_conf=None):
    if value is None:
        return True
@ -210,8 +209,6 @@ def validate_row(row, validation_steps, in_memory_sheets):
        kind = validation_step[TYPE]
        error_code = validation_step[ERROR_CODE]
        if kind == NAGOYA:
            if not is_valid_nagoya_v20200601(row, in_memory_sheets):
                return error_code
            if not is_valid_nagoya_v12052023(row, in_memory_sheets):
                return error_code
        elif kind == BIBLIO:
@ -281,39 +278,10 @@ def is_valid_nago(row):
    return True
 def parsee_mirri_excel(row, in_memory_sheets, version=""):
    # Dispatches a row-level Nagoya check on the version string.
    # NOTE(review): with the added line, both branches test "12052023", so the
    # `elif` branch can never run and "12052023" is routed to the 20200601
    # check -- confirm which version the first branch should match.
-    if version == "20200601":
+    if version == "12052023":
        return is_valid_nagoya_v20200601 (row, in_memory_sheets)
    elif version == "12052023":
        return is_valid_nagoya_v12052023 (row, in_memory_sheets)            
    else:
        # NOTE(review): the added message no longer names any version.
-        raise NotImplementedError("Only versions 20200601 and 12052023 are implemented")
+        raise NotImplementedError("Only version is implemented")
 def is_valid_nagoya_v20200601(row, in_memory_sheets):  # sourcery skip: return-identity
    """Check that a strain row dated 2014 or later has a country of origin.

    The country is looked up through the row's 'Geographic origin' index in
    the LOCATIONS sheet of ``in_memory_sheets``. The year comes from the
    first non-None value among the collection, isolation, deposit and
    catalogue-inclusion dates.

    Returns False only when a year is available, is at least 2014, and no
    country could be resolved; True otherwise.
    """
    origin_ref = row.get('Geographic origin', None)
    country = None
    if origin_ref is not None:
        origin_record = in_memory_sheets[LOCATIONS].get(origin_ref, {})
        country = origin_record.get('Country', None)

    # Date columns in order of preference; the first one holding a value wins.
    date_columns = (
        "Date of collection",
        "Date of isolation",
        "Date of deposit",
        "Date of inclusion in the catalogue",
    )
    when = None
    for column in date_columns:
        when = row.get(column, None)
        if when is not None:
            break

    if when is None:
        return True

    # Non-datetime values are read via the first four characters of their
    # string form.
    if isinstance(when, datetime):
        year = when.year
    else:
        year = int(str(when)[:4])

    return not (year >= 2014 and country is None)
 def is_valid_nagoya_v12052023(row, in_memory_sheets):  # sourcery skip: return-identity
    location_index = row.get('geographicOrigin', None)
--- a/validation/validate_v5.py
+++ b/validation/validate_v5.py
@ -9,10 +9,9 @@ from mirri.validation.excel_validator import validate_mirri_excel
 def main():
    path = Path(sys.argv[1])
    version = str(sys.argv[2])
    date = str(sys.argv[3])
    try:
-        error_log = validate_mirri_excel(path.open("rb"), version=version, date=date)
+        error_log = validate_mirri_excel(path.open("rb"), version=version)
    except NotImplementedError as e:
        print(e)    
--- a/validation/validation_conf_12052023.py
+++ b/validation/validation_conf_12052023.py
@ -4,7 +4,7 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS
                                   UNIQUE,VERSION,
                                   VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1)
 from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
-                            STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS,  CONTROL_SHEET,)
+                            STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS,  CONTROL_SHEET)
@ -323,7 +323,7 @@ STRAIN_FIELDS = [
    {
        FIELD: "plasmidCollections",
         VALIDATION: [
-            {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$",
+            {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(\s*;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$",
             ERROR_CODE: "STD62"}
         ]
    },
@ -358,7 +358,7 @@ STRAIN_FIELDS = [
    {
        FIELD: "sequenceLiterature",
        VALIDATION: [
-             {TYPE: REGEXP, MATCH: "^\d+(;?\s*\d+)*$", ERROR_CODE: "STD61"},
+             {TYPE: REGEXP, MATCH: "^\d+(\s*;?\s*\d+)*$", ERROR_CODE: "STD61"},
        ]
    },
--- a/validation/validation_conf_20200601.py
+++ b/validation/validation_conf_20200601.py
@ -1,545 +0,0 @@
 from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
                                   ERROR_CODE, FIELD, MANDATORY, MATCH,
                                   MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
                                   UNIQUE,
                                   VALIDATION, VALUES, BIBLIO)
 from mirri.settings_v1 import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
                            STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS)
 # GEOGRAPHIC_ORIGIN
 # SEXUAL_STATE_SHEET,
 # RESOURCE_TYPES_VALUES,
 # FORM_OF_SUPPLY_SHEET,
 # PLOIDY_SHEET)
 # Per-column validation rules. Each entry names a column (FIELD) and may
 # carry a VALIDATION list; every step in that list has a TYPE and an
 # ERROR_CODE, plus type-specific options (VALUES, MULTIPLE, SEPARATOR,
 # CROSSREF_NAME, ...). Entries without VALIDATION declare the column only.
 # Presumably these describe the "Strains" sheet -- verify against the schema
 # that references this list.
 STRAIN_FIELDS = [
    {
        FIELD: "Accession number",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: 'STD01'},
            {TYPE: UNIQUE, ERROR_CODE: 'STD03'},
            {TYPE: MISSING, ERROR_CODE: "STD02"},
            {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"}
        ]
    },
    {
        FIELD: "Restrictions on use",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD05"},
            {TYPE: MISSING, ERROR_CODE: "STD06"},
            {TYPE: CHOICES, VALUES: ["1", "2", "3"],
             MULTIPLE: False, ERROR_CODE: "STD07"}
        ]
    },
    {
        FIELD: "Nagoya protocol restrictions and compliance conditions",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD08"},
            {TYPE: MISSING, ERROR_CODE: "STD09"},
            {TYPE: CHOICES, VALUES: ["1", "2", "3"],
             MULTIPLE: False, ERROR_CODE: "STD10"}
        ]
    },
    {
        FIELD: "ABS related files",
        VALIDATION: [],
    },
    {
        FIELD: "MTA file",
        VALIDATION: [],
    },
    {
        FIELD: "Other culture collection numbers",
        # VALIDATION: [
        #     {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07",
        #      MULTIPLE: True, SEPARATOR: ";"}
        # ]
    },
    {
        FIELD: "Strain from a registered collection",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"],
             ERROR_CODE: "STD11"}
        ]
    },
    {
        FIELD: "Risk Group",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD12"},
            {TYPE: MISSING, ERROR_CODE: "STD13"},
            {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"],
             MULTIPLE: False, ERROR_CODE: "STD14"}
        ]
    },
    {
        FIELD: "Dual use",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"],
             ERROR_CODE: "STD15"}
        ]
    },
    {
        FIELD: "Quarantine in Europe",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"],
             ERROR_CODE: "STD16"}
        ]
    },
    {
        FIELD: "Organism type",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD17"},
            {TYPE: MISSING, ERROR_CODE: "STD18"},
            {TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria",
                                     "Cyanobacteria", "Filamentous Fungi",
                                     "Phage", "Plasmid", "Virus", "Yeast",
                                     "1", "2", "3", "4", "5", "6", "7", "8", "9"],
             MULTIPLE: True, SEPARATOR: ";",  ERROR_CODE: "STD19"}
        ]
    },
    {
        FIELD: "Taxon name",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD20"},
            {TYPE: MISSING, ERROR_CODE: "STD21"},
            {TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True,
             SEPARATOR: ';'}
        ]
    },
    {
        FIELD: "Infrasubspecific names",
    },
    {
        FIELD: "Comment on taxonomy",
    },
    {
        FIELD: "Interspecific hybrid",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"],
             ERROR_CODE: "STD23"}
        ]
    },
    {
        FIELD: "Status",
    },
    {
        FIELD: "History of deposit",
        VALIDATION: [
            # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24",  # modify the regex
            #  MULTIPLE: True, SEPARATOR: ";"}
        ]
    },
    {
        FIELD: "Depositor"
    },
    {
        FIELD: "Date of deposit",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD25"},
        ]
    },
    {
        FIELD: "Date of inclusion in the catalogue",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD26"},
        ]
    },
    {
        FIELD: "Collected by",
    },
    {
        FIELD: "Date of collection",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD27"},
        ]
    },
    {
        FIELD: "Isolated by",
    },
    {
        FIELD: "Date of isolation",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD28"},
        ]
    },
    {
        FIELD: "Substrate/host of isolation",
    },
    {
        FIELD: "Tested temperature growth range",
        VALIDATION: [
            # NOTE(review): this step uses the literal key "match" whereas the
            # "Accession number" step uses the MATCH constant; the two agree
            # only if MATCH == "match" -- confirm in mirri.validation.tags.
            {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
             ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"}
        ]
    },
    {
        FIELD: "Recommended growth temperature",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD30"},
            {TYPE: MISSING, ERROR_CODE: "STD31"},
            {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
             ERROR_CODE: "STD32",
             MULTIPLE: True, SEPARATOR: ";"}
        ]
    },
    {
        FIELD: "Recommended medium for growth",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD33"},
            {TYPE: MISSING, ERROR_CODE: "STD34"},
            {TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
             MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
        ]
    },
    {
        FIELD: "Form of supply",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD36"},
            {TYPE: MISSING, ERROR_CODE: "STD37"},
            {TYPE: CHOICES, VALUES: ['Agar', 'Cryo', 'Dry Ice', 'Liquid Culture Medium',
                                     'Lyo', 'Oil', 'Water'],
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"}
        ]
    },
    {
        FIELD: "Other denomination",
    },
    {
        FIELD: "Coordinates of geographic origin",
        VALIDATION: [
            {TYPE: COORDINATES, ERROR_CODE: "STD39"},
        ]
    },
    {
        FIELD: "Altitude of geographic origin",
        VALIDATION: [
            {TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"},
        ]
    },
    {
        # value can be in the cell or in another sheet. Don't configure this
        FIELD: "Geographic origin",
    },
    {
        FIELD: "Isolation habitat",
    },
    {
        FIELD: "Ontobiotope term for the isolation habitat",
        VALIDATION: [
            {TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope",
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"}
        ]
    },
    {
        FIELD: "GMO",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"],
             ERROR_CODE: "STD42"}
        ]
    },
    {
        FIELD: "GMO construction information",
    },
    {
        FIELD: "Mutant information",
    },
    {
        FIELD: "Genotype",
    },
    {
        FIELD: "Sexual state",
        VALIDATION: [
            {TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET,
             ERROR_CODE: "STD43"}
            # {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha",
            #                          "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha",
            #                          "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"],
            #  ERROR_CODE: "STD43"}
        ]
    },
    {
        FIELD: "Ploidy",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"],
             ERROR_CODE: "STD44"}
        ]
    },
    {
        FIELD: "Plasmids",
    },
    {
        FIELD: "Plasmids collections fields",
    },
    {
        # value can be in the cell or in another sheet. Don't configure this
        FIELD: "Literature",
        VALIDATION: [
            {TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET,
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"}
        ]
    },
    {
        FIELD: "Plant pathogenicity code",
    },
    {
        FIELD: "Pathogenicity",
    },
    {
        FIELD: "Enzyme production",
    },
    {
        FIELD: "Production of metabolites",
    },
    {
        FIELD: "Applications",
    },
    {
        FIELD: "Remarks"
    },
    {
        FIELD: "Literature linked to the sequence/genome",
    },
 ]
 # Validation schema for the workbook, keyed by sheet-name constant.
 # Every sheet entry holds, in this order:
 #   "acronym"       - matches the prefix of the sheet's column error codes
 #   "id_field"      - a column name (presumably the row identifier; confirm
 #                     against the validator)
 #   VALIDATION      - a single sheet-level rule (MARKERS has none)
 #   ROW_VALIDATION  - list of row-level rules (only STRAINS and
 #                     LITERATURE_SHEET declare it)
 #   COLUMNS         - list of column specs: a FIELD name plus a VALIDATION
 #                     list of rules, which may be empty
 SHEETS_SCHEMA = {
    LOCATIONS: {
        "acronym": "GOD",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"},
        COLUMNS: [
            {FIELD: "ID",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GOD01"},
                          {TYPE: MISSING, ERROR_CODE: "GOD02"}]},
            {FIELD: "Country",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GOD03"},
                          {TYPE: MISSING, ERROR_CODE: "GOD04"}]},
            {FIELD: "Region", VALIDATION: []},
            {FIELD: "City", VALIDATION: []},
            {FIELD: "Locality",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GOD06"},
                          {TYPE: MISSING, ERROR_CODE: "GOD07"}]},
        ],
    },
    GROWTH_MEDIA: {
        "acronym": "GMD",
        "id_field": "Acronym",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"},
        COLUMNS: [
            {FIELD: "Acronym",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GMD01"},
                          {TYPE: MISSING, ERROR_CODE: "GMD02"}]},
            {FIELD: "Description",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GMD03"},
                          {TYPE: MISSING, ERROR_CODE: "GMD04"}]},
            {FIELD: "Full description", VALIDATION: []},
        ],
    },
    GENOMIC_INFO: {
        "acronym": "GID",
        "id_field": "Strain AN",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"},
        COLUMNS: [
            # NOTE(review): the cross-reference name below is the literal
            # "Strains", not the STRAINS constant - presumably they are
            # equal; confirm before replacing one with the other.
            {FIELD: "Strain AN",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GID01"},
                          {TYPE: MISSING, ERROR_CODE: "GID02"},
                          {TYPE: CROSSREF, CROSSREF_NAME: "Strains",
                           ERROR_CODE: "GID03"}]},
            {FIELD: "Marker",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GID04"},
                          {TYPE: MISSING, ERROR_CODE: "GID05"},
                          {TYPE: CROSSREF, CROSSREF_NAME: MARKERS,
                           ERROR_CODE: "GID06"}]},
            {FIELD: "INSDC AN",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "GID07"},
                          {TYPE: MISSING, ERROR_CODE: "GID08"}]},
            {FIELD: "Sequence", VALIDATION: []},
        ],
    },
    STRAINS: {
        "acronym": "STD",
        "id_field": "Accession number",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
        ROW_VALIDATION: [{TYPE: NAGOYA, ERROR_CODE: "STD46"}],
        # The column specs of this sheet live in STRAIN_FIELDS.
        COLUMNS: STRAIN_FIELDS,
    },
    LITERATURE_SHEET: {
        "acronym": "LID",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"},
        ROW_VALIDATION: [{TYPE: BIBLIO, ERROR_CODE: "LID17"}],
        COLUMNS: [
            {FIELD: "ID",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID01"},
                          {TYPE: MISSING, ERROR_CODE: "LID02"}]},
            {FIELD: "Full reference",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID03"}]},
            {FIELD: "Authors",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID05"}]},
            {FIELD: "Title",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID07"}]},
            {FIELD: "Journal",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID09"}]},
            {FIELD: "Year",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID11"}]},
            {FIELD: "Volume",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID13"}]},
            {FIELD: "Issue", VALIDATION: []},
            {FIELD: "First page",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID15"},
                          {TYPE: MISSING, ERROR_CODE: "LID16"}]},
            {FIELD: "Last page", VALIDATION: []},
            {FIELD: "Book title", VALIDATION: []},
            {FIELD: "Editors", VALIDATION: []},
            {FIELD: "Publisher", VALIDATION: []},
        ],
    },
    # No schema entry exists for SEXUAL_STATE_SHEET ("SSD"),
    # RESOURCE_TYPES_VALUES ("RTD"), FORM_OF_SUPPLY_SHEET ("FSD") or
    # PLOIDY_SHEET ("PLD"); placeholder entries with empty COLUMNS were
    # previously left commented out at this position.
    ONTOBIOTOPE: {
        "acronym": "OTD",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"},
        COLUMNS: [
            {FIELD: "ID",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "OTD01"},
                          {TYPE: MISSING, ERROR_CODE: "OTD02"}]},
            {FIELD: "Name",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "OTD03"},
                          {TYPE: MISSING, ERROR_CODE: "OTD04"}]},
        ],
    },
    # MARKERS declares no sheet-level VALIDATION rule and no column rules.
    MARKERS: {
        "acronym": "MKD",
        "id_field": "Acronym",
        COLUMNS: [
            {FIELD: "Acronym", VALIDATION: []},
            {FIELD: "Marker", VALIDATION: []},
        ],
    },
 }
 # Per-sheet lists of column names, keyed by sheet-name constant.
 # Presumably these are the columns whose values a CROSSREF rule may point
 # at - confirm against the validator.
 CROSS_REF_CONF = {
    ONTOBIOTOPE: ["ID", "Name"],
    LITERATURE_SHEET: ["ID"],
    LOCATIONS: ["Locality"],
    GROWTH_MEDIA: ["Acronym"],
    STRAINS: ["Accession number"],
    # NOTE(review): no columns are listed for this sheet - verify that the
    # validator treats an empty list as intended.
    SEXUAL_STATE_SHEET: [],
    MARKERS: ["Acronym"],
 }
 # Bundles the sheet schema, the cross-reference configuration and the list
 # of sheets to keep in memory (only LOCATIONS, indexed by "Locality").
 # The misspelt name is kept because other code may refer to it.
 MIRRI_20200601_VALLIDATION_CONF = {
    "sheet_schema": SHEETS_SCHEMA,
    "cross_ref_conf": CROSS_REF_CONF,
    "keep_sheets_in_memory": [
        {
            "sheet_name": LOCATIONS,
            "indexed_by": "Locality",
        },
    ],
 }