diff --git a/__pycache__/__init__.cpython-311.pyc b/__pycache__/__init__.cpython-311.pyc index c57a87e..d615b6f 100644 Binary files a/__pycache__/__init__.cpython-311.pyc and b/__pycache__/__init__.cpython-311.pyc differ diff --git a/__pycache__/settings.cpython-311.pyc b/__pycache__/settings.cpython-311.pyc index 542840d..42e183f 100644 Binary files a/__pycache__/settings.cpython-311.pyc and b/__pycache__/settings.cpython-311.pyc differ diff --git a/__pycache__/settings_v1.cpython-311.pyc b/__pycache__/settings_v1.cpython-311.pyc index 7ee855f..b6d3af8 100644 Binary files a/__pycache__/settings_v1.cpython-311.pyc and b/__pycache__/settings_v1.cpython-311.pyc differ diff --git a/io/parsers/__pycache__/__init__.cpython-311.pyc b/io/parsers/__pycache__/__init__.cpython-311.pyc index a3a1711..d890dbd 100644 Binary files a/io/parsers/__pycache__/__init__.cpython-311.pyc and b/io/parsers/__pycache__/__init__.cpython-311.pyc differ diff --git a/io/parsers/__pycache__/excel.cpython-311.pyc b/io/parsers/__pycache__/excel.cpython-311.pyc index 5189d49..35f15e6 100644 Binary files a/io/parsers/__pycache__/excel.cpython-311.pyc and b/io/parsers/__pycache__/excel.cpython-311.pyc differ diff --git a/io/parsers/mirri_excel.py b/io/parsers/mirri_excel.py index d65d1ce..ac1b141 100644 --- a/io/parsers/mirri_excel.py +++ b/io/parsers/mirri_excel.py @@ -19,12 +19,6 @@ from mirri.settings import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO, NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION, ONLY_RESEARCH, ONTOBIOTOPE, PUBLICATION_FIELDS, STRAINS, SUBTAXAS) -from mirri.settings_v1 import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO, - GROWTH_MEDIA, LITERATURE_SHEET, LOCATIONS, - MIRRI_FIELDS, NAGOYA_DOCS_AVAILABLE, NAGOYA_NO_RESTRICTIONS, - NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION, - ONLY_RESEARCH, ONTOBIOTOPE, - PUBLICATION_FIELDS, STRAINS, SUBTAXAS) from mirri.utils import get_country_from_name RESTRICTION_USE_TRANSLATOR = { @@ -44,34 +38,12 @@ TRUEFALSE_TRANSLATOR = { def parse_mirri_excel(fhand, version=""): - if version == "20200602": - return _parse_mirri_v20200601(fhand) - elif version == "12052023": + if version == "5.1.2": return _parse_mirri_v12052023(fhand) else: - raise NotImplementedError("Only versions 20200601 and 12052023 are implemented") + raise NotImplementedError("Only version is 5.1.2 implemented") -def _parse_mirri_v20200601(fhand): - fhand.seek(0) - file_content = BytesIO(fhand.read()) - wb = load_workbook(filename=file_content, read_only=True, data_only=True) - - locations = workbook_sheet_reader(wb, LOCATIONS) - ontobiotopes = workbook_sheet_reader(wb, ONTOBIOTOPE) - - growth_media = list(parse_growth_media(wb)) - - markers = workbook_sheet_reader(wb, GENOMIC_INFO) - - publications = list(parse_publications(wb)) - - strains = parse_strains(wb, locations=locations, growth_media=growth_media, - markers=markers, publications=publications, - ontobiotopes=ontobiotopes) - - return {"strains": strains, "growth_media": growth_media} - def _parse_mirri_v12052023(fhand): fhand.seek(0) file_content = BytesIO(fhand.read()) diff --git a/io/writers/mirri_excel.py b/io/writers/mirri_excel.py index 89e0b4a..c66e99d 100644 --- a/io/writers/mirri_excel.py +++ b/io/writers/mirri_excel.py @@ -5,7 +5,6 @@ from openpyxl.workbook.workbook import Workbook from mirri import rgetattr from mirri.settings import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS -from mirri.settings_v1 import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS from mirri.io.parsers.mirri_excel import NAGOYA_TRANSLATOR, RESTRICTION_USE_TRANSLATOR INITIAL_SEXUAL_STATES = [ @@ -51,81 +50,9 @@ PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS] def write_mirri_excel(path, strains, growth_media, version): - if version == "20200601": - _write_mirri_excel_20200601(path, strains, growth_media) - - if version == "12052023": + if version == "5.1.2": _write_mirri_excel_12052023(path, strains, growth_media) - -def _write_mirri_excel_20200601(path, strains, growth_media): - wb = Workbook() - - write_markers_sheet(wb) - - ontobiotope_path = DATA_DIR / "ontobiotopes.csv" - write_ontobiotopes(wb, ontobiotope_path) - - write_growth_media(wb, growth_media) - growth_media_indexes = [str(gm.acronym) for gm in growth_media] - - locations = {} - publications = {} - sexual_states = set(deepcopy(INITIAL_SEXUAL_STATES)) - genomic_markers = {} - strains_data = _deserialize_strains(strains, locations, growth_media_indexes, - publications, sexual_states, genomic_markers) - strains_data = list(strains_data) - - # write strain to generate indexed data - strain_sheet = wb.create_sheet("Strains") - strain_sheet.append([field["label"] for field in MIRRI_FIELDS]) - for strain_row in strains_data: - strain_sheet.append(strain_row) - redimension_cell_width(strain_sheet) - - # write locations - loc_sheet = wb.create_sheet("Geographic origin") - loc_sheet.append(["ID", "Country", "Region", "City", "Locality"]) - for index, loc_index in enumerate(locations.keys()): - location = locations[loc_index] - row = [index, location.country, location.state, location.municipality, - loc_index] - loc_sheet.append(row) - redimension_cell_width(loc_sheet) - - # write publications - pub_sheet = wb.create_sheet("Literature") - pub_sheet.append(PUB_HEADERS) - for publication in publications.values(): - row = [] - for pub_field in PUBLICATION_FIELDS: - # if pub_field['attribute'] == 'id': - # value = index - value = getattr(publication, pub_field['attribute'], None) - row.append(value) - pub_sheet.append(row) - redimension_cell_width(pub_sheet) - - # write sexual states - sex_sheet = wb.create_sheet("Sexual state") - for sex_state in sorted(list(sexual_states)): - sex_sheet.append([sex_state]) - redimension_cell_width(sex_sheet) - - # write genetic markers - markers_sheet = wb.create_sheet("Genomic information") - markers_sheet.append(['Strain AN', 'Marker', 'INSDC AN', 'Sequence']) - for strain_id, markers in genomic_markers.items(): - for marker in markers: - row = [strain_id, marker.marker_type, marker.marker_id, marker.marker_seq] - markers_sheet.append(row) - redimension_cell_width(markers_sheet) - - del wb["Sheet"] - wb.save(str(path)) - - def _write_mirri_excel_12052023(path, strains, growth_media): wb = Workbook() diff --git a/settings_v1.py b/settings_v1.py deleted file mode 100644 index 394be32..0000000 --- a/settings_v1.py +++ /dev/null @@ -1,311 +0,0 @@ -from pathlib import Path - -from charset_normalizer import VERSION - -DATA_DIR = Path(__file__).parent / "data" - -ACCESSION_NUMBER = "accession_number" -RESTRICTION_ON_USE = "restriction_on_use" -NAGOYA_PROTOCOL = "nagoya_protocol" -ABS_RELATED_FILES = "abs_related_files" -MTA_FILES = "mta_file" -OTHER_CULTURE_NUMBERS = "other_culture_collection_numbers" -STRAIN_FROM_REGISTERED_COLLECTION = "strain_from_a_registered_collection" -RISK_GROUP = "risk_group" -DUAL_USE = "dual_use" -QUARANTINE = "quarantine" -ORGANISM_TYPE = "organism_type" -TAXON_NAME = "taxon_name" -INFRASUBSPECIFIC_NAME = "infrasubspecific_names" -COMMENTS_ON_TAXONOMY = "comments_on_taxonomy" -STATUS = "status" -HISTORY_OF_DEPOSIT = "history_of_deposit" -DEPOSITOR = "depositor" -DATE_OF_DEPOSIT = "date_of_deposit" -COLLECTED_BY = "collected_by" -DATE_OF_COLLECTION = "date_of_collection" -ISOLATED_BY = "isolated_by" -DATE_OF_ISOLATION = "date_of_isolation" -DATE_OF_INCLUSION = "date_of_inclusion_on_catalog" -TESTED_TEMPERATURE_GROWTH_RANGE = "tested_temperature_growth_range" -RECOMMENDED_GROWTH_TEMP = "recommended_growth_temperature" -RECOMMENDED_GROWTH_MEDIUM = "recommended_media_for_growth" -FORM_OF_SUPPLY = "form_of_supply" -GEO_COORDS = "coordinates_of_geographic_origin" -ACCESSION_NAME = "other_denomination" -ALTITUDE = "altitude_of_geographic_origin" -GEOGRAPHIC_ORIGIN = "geographic_origin" -GMO = "gmo" -GMO_CONSTRUCTION_INFO = "gmo_construction_information" -MUTANT_INFORMATION = "mutant_information" -GENOTYPE = "genotype" -LITERATURE = "literature" -SEXUAL_STATE = "sexual_state" -PLOIDY = "ploidy" -INTERSPECIFIC_HYBRID = "interspecific_hybrid" -HYBRIDS = 'hybrids' -PLANT_PATHOGENICITY_CODE = "plant_pathogenicity_code" -PATHOGENICITY = "pathogenicity" -ENZYME_PRODUCTION = "enzyme_production" -PRODUCTION_OF_METABOLITES = "production_of_metabolites" -APPLICATIONS = "applications" -REMARKS = "remarks" -PLASMIDS = "plasmids" -PLASMIDS_COLLECTION_FIELDS = "plasmids_collections_fields" -SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation" -ISOLATION_HABITAT = "isolation_habitat" -ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat" -LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome" - -# StrainId -STRAIN_ID = "id" -COLLECTION_CODE = "collection_code" -STRAIN_PUI = "strain_pui" -STRAIN_URL = "strain_url" - -ID_SYNONYMS = 'id_synonyms' -# Taxonomy -GENUS = "genus" -SPECIES = "species" - -# Location -COUNTRY = "countryOfOriginCode" -SITE = "site" -STATE = "state" -PROVINCE = "province" -MUNICIPALITY = "municipality" -ISLAND = "island" -OTHER = "other" -LATITUDE = "latitude" -LONGITUDE = "longitude" -ALTITUDE = "altitude" -GEOREF_METHOD = "georeferencingMethod" -COORDUNCERTAINTY = "coordUncertainty" -COORD_SPATIAL_REFERENCE = "coordenatesSpatialReference" -LOCATION = "location" - -ALLOWED_COLLECTING_SITE_KEYS = [ - COUNTRY, - STATE, - PROVINCE, - ISLAND, - MUNICIPALITY, - OTHER, - SITE, - LATITUDE, - LONGITUDE, - ALTITUDE, - GEOREF_METHOD, - COORDUNCERTAINTY, - COORD_SPATIAL_REFERENCE, -] - -MIRRI_FIELDS = [ - {"attribute": "id", "label": "Accession number"}, - {"attribute": "restriction_on_use", "label": "Restrictions on use"}, - {"attribute": "nagoya_protocol", - "label": "Nagoya protocol restrictions and compliance conditions"}, - {"attribute": ABS_RELATED_FILES, "label": "ABS related files"}, - {"attribute": "mta_files", "label": "MTA file"}, - {"attribute": "other_numbers", "label": "Other culture collection numbers"}, - {"attribute": "is_from_registered_collection", - "label": "Strain from a registered collection"}, - {"attribute": "risk_group", "label": "Risk Group"}, - {"attribute": "is_potentially_harmful", "label": "Dual use"}, - {"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"}, - {"attribute": "taxonomy.organism_type", "label": "Organism type"}, - {"attribute": "taxonomy.taxon_name", "label": "Taxon name"}, - {"attribute": "taxonomy.infrasubspecific_name", - "label": "Infrasubspecific names"}, - {"attribute": "taxonomy.comments", "label": "Comment on taxonomy"}, - {"attribute": "taxonomy.interspecific_hybrid", - "label": "Interspecific hybrid"}, - {"attribute": "status", "label": "Status"}, - {"attribute": "history", "label": "History of deposit", }, - {"attribute": "deposit.who", "label": "Depositor"}, - {"attribute": "deposit.date", "label": "Date of deposit"}, - {"attribute": "catalog_inclusion_date", - "label": "Date of inclusion in the catalogue"}, - {"attribute": "collect.who", "label": "Collected by"}, - {"attribute": "collect.date", "label": "Date of collection"}, - {"attribute": "isolation.who", "label": "Isolated by"}, - {"attribute": "isolation.date", "label": "Date of isolation"}, - {"attribute": "isolation.substrate_host_of_isolation", - "label": "Substrate/host of isolation"}, - {"attribute": "growth.tested_temp_range", - "label": "Tested temperature growth range"}, - {"attribute": "growth.recommended_temp", - "label": "Recommended growth temperature"}, - {"attribute": "growth.recommended_media", - "label": "Recommended medium for growth"}, - {"attribute": "form_of_supply", "label": "Form of supply"}, - {"attribute": "other_denominations", "label": "Other denomination"}, - {"attribute": "collect.location.coords", - "label": "Coordinates of geographic origin"}, - {"attribute": "collect.location.altitude", - "label": "Altitude of geographic origin"}, - {"attribute": "collect.location", "label": "Geographic origin"}, - {"attribute": "collect.habitat", "label": "Isolation habitat"}, - {"attribute": "collect.habitat_ontobiotope", - "label": "Ontobiotope term for the isolation habitat"}, - {"attribute": "genetics.gmo", "label": "GMO"}, - {"attribute": "genetics.gmo_construction", - "label": "GMO construction information"}, - {"attribute": "genetics.mutant_info", "label": "Mutant information"}, - {"attribute": "genetics.genotype", "label": "Genotype"}, - {"attribute": "genetics.sexual_state", "label": "Sexual state"}, - {"attribute": "genetics.ploidy", "label": "Ploidy"}, - {"attribute": "genetics.plasmids", "label": "Plasmids"}, - {"attribute": "genetics.plasmids_in_collections", - "label": "Plasmids collections fields"}, - {"attribute": "publications", "label": "Literature"}, - {"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"}, - {"attribute": "pathogenicity", "label": "Pathogenicity"}, - {"attribute": "enzyme_production", "label": "Enzyme production"}, - {"attribute": "production_of_metabolites", - "label": "Production of metabolites"}, - {"attribute": "applications", "label": "Applications", }, - {"attribute": "remarks", "label": "Remarks"}, - {"attribute": LITERATURE_LINKED_TO_SEQ_GENOME, - "label": "Literature linked to the sequence/genome"}, -] - -ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma", - 'forma.specialis'] -ALLOWED_TAXONOMIC_RANKS = ["family", "genus", "species"] + ALLOWED_SUBTAXA - -# nagoya -NAGOYA_NO_RESTRICTIONS = "no_known_restrictions_under_the_Nagoya_protocol" -NAGOYA_DOCS_AVAILABLE = "documents_providing_proof_of_legal_access_and_terms_of_use_available_at_the_collection" -NAGOYA_PROBABLY_SCOPE = "strain_probably_in_scope,_please_contact_the_culture_collection" - -ALLOWED_NAGOYA_OPTIONS = [NAGOYA_NO_RESTRICTIONS, - NAGOYA_DOCS_AVAILABLE, NAGOYA_PROBABLY_SCOPE] - -# Use restriction -NO_RESTRICTION = "no_restriction" -ONLY_RESEARCH = "only_research" -COMMERCIAL_USE_WITH_AGREEMENT = "commercial_use_with_agreement" - -ALLOWED_RESTRICTION_USE_OPTIONS = [ - NO_RESTRICTION, - ONLY_RESEARCH, - COMMERCIAL_USE_WITH_AGREEMENT, -] - -ALLOWED_RISK_GROUPS = ["1", "2", "3", "4"] - -AGAR = "Agar" -CRYO = "Cryo" -DRY_ICE = "Dry Ice" -LIQUID_CULTURE_MEDIUM = "Liquid Culture Medium" -LYO = "Lyo" -OIL = "Oil" -WATER = "Water" -ALLOWED_FORMS_OF_SUPPLY = [AGAR, CRYO, DRY_ICE, - LIQUID_CULTURE_MEDIUM, LYO, OIL, WATER] - -DEPOSIT = "deposit" -ISOLATION = "isolation" -COLLECT = "collect" -GROWTH = "growth" -GENETICS = "genetics" -TAXONOMY = "taxonomy" -# Markers -MARKERS = "markers" -MARKER_TYPE = "marker_type" -MARKER_INSDC = "INSDC" -MARKER_SEQ = "marker_seq" -ALLOWED_MARKER_TYPES = [ - {"acronym": "16S rRNA", "marker": "16S rRNA"}, - {"acronym": "ACT", "marker": "Actin"}, - {"acronym": "CaM", "marker": "Calmodulin"}, - {"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"}, - {"acronym": "ITS", - "marker": "nuclear ribosomal Internal Transcribed Spacer (ITS)"}, - {"acronym": "LSU", "marker": "nuclear ribosomal Large SubUnit (LSU)"}, - {"acronym": "RPB1", "marker": "Ribosomal RNA-coding genes RPB1"}, - {"acronym": "RPB2", "marker": "Ribosomal RNA-coding genes RPB2"}, - {"acronym": "TUBB", "marker": "β-Tubulin"}, -] - -PUBLICATIONS = "publications" -PUB_ID = "id" -PUB_DOI = "pub_doi" -PUB_PUBMED_ID = '' -PUB_FULL_REFERENCE = "full_reference" -PUB_TITLE = "title" -PUB_AUTHORS = "authors" -PUB_JOURNAL = "journal" -PUB_YEAR = "year" -PUB_VOLUME = "volume" -PUB_ISSUE = "issue" -PUB_FIRST_PAGE = "first_page" -PUB_LAST_PAGE = "last_page" -BOOK_TITLE = "book_title" -BOOK_EDITOR = "book_editor" -BOOK_PUBLISHER = "book_publisher" - - -PUBLICATION_FIELDS = [ - {"label": "ID", "attribute": PUB_ID}, - {"label": "Full reference", "attribute": PUB_FULL_REFERENCE}, - {"label": "Authors", "attribute": PUB_AUTHORS}, - {"label": "Title", "attribute": PUB_TITLE}, - {"label": "Journal", "attribute": PUB_JOURNAL}, - {"label": "Year", "attribute": PUB_YEAR}, - {"label": "Volume", "attribute": PUB_VOLUME}, - {"label": "Issue", "attribute": PUB_ISSUE}, - {"label": "First page", "attribute": PUB_FIRST_PAGE}, - {"label": "Last page", "attribute": PUB_FIRST_PAGE}, - {"label": "Book title", "attribute": BOOK_TITLE}, - {"label": "Editors", "attribute": BOOK_EDITOR}, - {"label": "Publisher", "attribute": BOOK_PUBLISHER}, -] - - -# ploidy -ANEUPLOID = 0 -HAPLOID = 1 -DIPLOID = 2 -TRIPLOID = 3 -TETRAPLOID = 4 -POLYPLOID = 9 - -ALLOWED_PLOIDIES = [ANEUPLOID, HAPLOID, DIPLOID, TRIPLOID, TETRAPLOID, - POLYPLOID] - -SUBTAXAS = { - "subsp.": "subspecies", - "var.": "variety", - "convar.": "convarietas", - "group.": "group", - "f.": "forma", - "f.sp.": "forma.specialis" -} - - -#Control -VERSION = "Version" -DATE = "Date" - - -#Controle files -CONTROL_FIELDS = [ - {"label": "Version", "attribute": VERSION}, - {"label": "Date", "attribute": DATE}, -] - -# Excel sheet name -LOCATIONS = "Geographic origin" # 'Locations' -GROWTH_MEDIA = "Growth media" -GENOMIC_INFO = "Genomic information" -STRAINS = "Strains" -LITERATURE_SHEET = "Literature" -SEXUAL_STATE_SHEET = "Sexual state" -RESOURCE_TYPES_VALUES = "Resource types values" -FORM_OF_SUPPLY_SHEET = "Forms of supply" -PLOIDY_SHEET = "Ploidy" -ONTOBIOTOPE = "Ontobiotope" -MARKERS = "Markers" -CONTROL_SHEET = "Version" diff --git a/validation/2B90F320 b/validation/2B90F320 deleted file mode 100644 index 960fd7a..0000000 Binary files a/validation/2B90F320 and /dev/null differ diff --git a/validation/B3F84180 b/validation/B3F84180 deleted file mode 100644 index c303ee1..0000000 Binary files a/validation/B3F84180 and /dev/null differ diff --git a/validation/__pycache__/__init__.cpython-311.pyc b/validation/__pycache__/__init__.cpython-311.pyc index 62112ef..91c67bd 100644 Binary files a/validation/__pycache__/__init__.cpython-311.pyc and b/validation/__pycache__/__init__.cpython-311.pyc differ diff --git a/validation/__pycache__/excel_validator.cpython-311.pyc b/validation/__pycache__/excel_validator.cpython-311.pyc index b7c96b7..f7a65ec 100644 Binary files a/validation/__pycache__/excel_validator.cpython-311.pyc and b/validation/__pycache__/excel_validator.cpython-311.pyc differ diff --git a/validation/__pycache__/tags.cpython-311.pyc b/validation/__pycache__/tags.cpython-311.pyc index cc3cf34..97ef7c5 100644 Binary files a/validation/__pycache__/tags.cpython-311.pyc and b/validation/__pycache__/tags.cpython-311.pyc differ diff --git a/validation/__pycache__/validate_v5.cpython-311.pyc b/validation/__pycache__/validate_v5.cpython-311.pyc index ac53165..baa23e2 100644 Binary files a/validation/__pycache__/validate_v5.cpython-311.pyc and b/validation/__pycache__/validate_v5.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_12052023.cpython-311.pyc b/validation/__pycache__/validation_conf_12052023.cpython-311.pyc index 8349360..23f2165 100644 Binary files a/validation/__pycache__/validation_conf_12052023.cpython-311.pyc and b/validation/__pycache__/validation_conf_12052023.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_20200601.cpython-311.pyc b/validation/__pycache__/validation_conf_20200601.cpython-311.pyc index 512676a..d141cb0 100644 Binary files a/validation/__pycache__/validation_conf_20200601.cpython-311.pyc and b/validation/__pycache__/validation_conf_20200601.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/__init__.cpython-311.pyc b/validation/error_logging/__pycache__/__init__.cpython-311.pyc index 7e36c7c..ebece39 100644 Binary files a/validation/error_logging/__pycache__/__init__.cpython-311.pyc and b/validation/error_logging/__pycache__/__init__.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/error.cpython-311.pyc b/validation/error_logging/__pycache__/error.cpython-311.pyc index 6413d86..a72fea9 100644 Binary files a/validation/error_logging/__pycache__/error.cpython-311.pyc and b/validation/error_logging/__pycache__/error.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/error_log.cpython-311.pyc b/validation/error_logging/__pycache__/error_log.cpython-311.pyc index c00d750..a3da8c5 100644 Binary files a/validation/error_logging/__pycache__/error_log.cpython-311.pyc and b/validation/error_logging/__pycache__/error_log.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/error_message.cpython-311.pyc b/validation/error_logging/__pycache__/error_message.cpython-311.pyc index 0483c6d..d066fd5 100644 Binary files a/validation/error_logging/__pycache__/error_message.cpython-311.pyc and b/validation/error_logging/__pycache__/error_message.cpython-311.pyc differ diff --git a/validation/excel_validator.py b/validation/excel_validator.py index bc51755..73ec3ad 100644 --- a/validation/excel_validator.py +++ b/validation/excel_validator.py @@ -16,20 +16,19 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE, HISTORY,NAGOYA1, VERSION) from mirri.settings import LOCATIONS, SUBTAXAS -from mirri.settings_v1 import LOCATIONS, SUBTAXAS from mirri.validation.validation_conf_12052023 import version_config +from mirri.validation.validation_conf_12052023 import MIRRI_12052023_VALLIDATION_CONF -def validate_mirri_excel(fhand, version="", date=""): - configuration = version_config.get(version) - if configuration is None: - raise NotImplementedError("Unsupported version: " + version) - configuration["date"] = date or configuration.get("date") - if configuration["date"] != "12/05/2023": - raise ValueError("Invalid date. Expected: 12/05/2023") - return validate_excel(fhand, configuration) - + +def validate_mirri_excel(fhand, version= "5.1.2" ): + if version == "5.1.2": + configuration = MIRRI_12052023_VALLIDATION_CONF + else: + raise NotImplementedError("Only version 5.1.2 is implemented") + return validate_excel(fhand, configuration) + def version(value , validation_conf=None): if value is None: return True @@ -210,8 +209,6 @@ def validate_row(row, validation_steps, in_memory_sheets): kind = validation_step[TYPE] error_code = validation_step[ERROR_CODE] if kind == NAGOYA: - if not is_valid_nagoya_v20200601(row, in_memory_sheets): - return error_code if not is_valid_nagoya_v12052023(row, in_memory_sheets): return error_code elif kind == BIBLIO: @@ -281,39 +278,10 @@ def is_valid_nago(row): return True def parsee_mirri_excel(row, in_memory_sheets, version=""): - if version == "20200601": - return is_valid_nagoya_v20200601 (row, in_memory_sheets) - elif version == "12052023": + if version == "12052023": return is_valid_nagoya_v12052023 (row, in_memory_sheets) else: - raise NotImplementedError("Only versions 20200601 and 12052023 are implemented") - -def is_valid_nagoya_v20200601(row, in_memory_sheets): # sourcery skip: return-identity - location_index = row.get('Geographic origin', None) - if location_index is None: - country = None - else: - geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {}) - country = geo_origin.get('Country', None) - - _date = row.get("Date of collection", None) - if _date is None: - _date = row.get("Date of isolation", None) - if _date is None: - _date = row.get("Date of deposit", None) - if _date is None: - _date = row.get("Date of inclusion in the catalogue", None) - if _date is not None: - year = _date.year if isinstance(_date, datetime) else int(str(_date)[:4]) - else: - year = None - - if year is not None and year >= 2014 and country is None: - return False - - - - return True + raise NotImplementedError("Only version is implemented") def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity location_index = row.get('geographicOrigin', None) diff --git a/validation/validate_v5.py b/validation/validate_v5.py index 3cd828a..290fc00 100644 --- a/validation/validate_v5.py +++ b/validation/validate_v5.py @@ -9,10 +9,9 @@ from mirri.validation.excel_validator import validate_mirri_excel def main(): path = Path(sys.argv[1]) version = str(sys.argv[2]) - date = str(sys.argv[3]) try: - error_log = validate_mirri_excel(path.open("rb"), version=version, date=date) + error_log = validate_mirri_excel(path.open("rb"), version=version) except NotImplementedError as e: print(e) diff --git a/validation/validation_conf_12052023.py b/validation/validation_conf_12052023.py index 6c60db0..f656d14 100644 --- a/validation/validation_conf_12052023.py +++ b/validation/validation_conf_12052023.py @@ -4,7 +4,7 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS UNIQUE,VERSION, VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1) from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, - STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET,) + STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET) @@ -323,7 +323,7 @@ STRAIN_FIELDS = [ { FIELD: "plasmidCollections", VALIDATION: [ - {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$", + {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(\s*;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$", ERROR_CODE: "STD62"} ] }, @@ -358,7 +358,7 @@ STRAIN_FIELDS = [ { FIELD: "sequenceLiterature", VALIDATION: [ - {TYPE: REGEXP, MATCH: "^\d+(;?\s*\d+)*$", ERROR_CODE: "STD61"}, + {TYPE: REGEXP, MATCH: "^\d+(\s*;?\s*\d+)*$", ERROR_CODE: "STD61"}, ] }, diff --git a/validation/validation_conf_20200601.py b/validation/validation_conf_20200601.py deleted file mode 100644 index 5f667a9..0000000 --- a/validation/validation_conf_20200601.py +++ /dev/null @@ -1,545 +0,0 @@ -from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE, - ERROR_CODE, FIELD, MANDATORY, MATCH, - MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE, - UNIQUE, - VALIDATION, VALUES, BIBLIO) -from mirri.settings_v1 import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, - STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS) -# GEOGRAPHIC_ORIGIN -# SEXUAL_STATE_SHEET, -# RESOURCE_TYPES_VALUES, -# FORM_OF_SUPPLY_SHEET, -# PLOIDY_SHEET) - - -STRAIN_FIELDS = [ - { - FIELD: "Accession number", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: 'STD01'}, - {TYPE: UNIQUE, ERROR_CODE: 'STD03'}, - {TYPE: MISSING, ERROR_CODE: "STD02"}, - {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"} - ] - }, - { - FIELD: "Restrictions on use", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD05"}, - {TYPE: MISSING, ERROR_CODE: "STD06"}, - {TYPE: CHOICES, VALUES: ["1", "2", "3"], - MULTIPLE: False, ERROR_CODE: "STD07"} - ] - }, - { - FIELD: "Nagoya protocol restrictions and compliance conditions", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD08"}, - {TYPE: MISSING, ERROR_CODE: "STD09"}, - {TYPE: CHOICES, VALUES: ["1", "2", "3"], - MULTIPLE: False, ERROR_CODE: "STD10"} - ] - }, - { - FIELD: "ABS related files", - VALIDATION: [], - }, - { - FIELD: "MTA file", - VALIDATION: [], - }, - { - FIELD: "Other culture collection numbers", - # VALIDATION: [ - # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07", - # MULTIPLE: True, SEPARATOR: ";"} - # ] - }, - { - FIELD: "Strain from a registered collection", - VALIDATION: [ - {TYPE: CHOICES, VALUES: ["1", "2"], - ERROR_CODE: "STD11"} - ] - }, - { - FIELD: "Risk Group", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD12"}, - {TYPE: MISSING, ERROR_CODE: "STD13"}, - {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"], - MULTIPLE: False, ERROR_CODE: "STD14"} - ] - }, - { - FIELD: "Dual use", - VALIDATION: [ - {TYPE: CHOICES, VALUES: ["1", "2"], - ERROR_CODE: "STD15"} - ] - }, - { - FIELD: "Quarantine in Europe", - VALIDATION: [ - {TYPE: CHOICES, VALUES: ["1", "2"], - ERROR_CODE: "STD16"} - ] - }, - { - FIELD: "Organism type", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD17"}, - {TYPE: MISSING, ERROR_CODE: "STD18"}, - {TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria", - "Cyanobacteria", "Filamentous Fungi", - "Phage", "Plasmid", "Virus", "Yeast", - "1", "2", "3", "4", "5", "6", "7", "8", "9"], - MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"} - ] - }, - { - FIELD: "Taxon name", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD20"}, - {TYPE: MISSING, ERROR_CODE: "STD21"}, - {TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True, - SEPARATOR: ';'} - ] - }, - { - FIELD: "Infrasubspecific names", - }, - { - FIELD: "Comment on taxonomy", - }, - { - FIELD: "Interspecific hybrid", - VALIDATION: [ - {TYPE: CHOICES, VALUES: ["1", "2"], - ERROR_CODE: "STD23"} - ] - }, - { - FIELD: "Status", - }, - { - FIELD: "History of deposit", - VALIDATION: [ - # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex - # MULTIPLE: True, SEPARATOR: ";"} - ] - }, - { - FIELD: "Depositor" - }, - { - FIELD: "Date of deposit", - VALIDATION: [ - {TYPE: DATE, ERROR_CODE: "STD25"}, - ] - }, - { - FIELD: "Date of inclusion in the catalogue", - VALIDATION: [ - {TYPE: DATE, ERROR_CODE: "STD26"}, - ] - }, - { - FIELD: "Collected by", - }, - { - FIELD: "Date of collection", - VALIDATION: [ - {TYPE: DATE, ERROR_CODE: "STD27"}, - ] - }, - { - FIELD: "Isolated by", - }, - { - FIELD: "Date of isolation", - VALIDATION: [ - {TYPE: DATE, ERROR_CODE: "STD28"}, - ] - }, - { - FIELD: "Substrate/host of isolation", - }, - { - FIELD: "Tested temperature growth range", - VALIDATION: [ - {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', - ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"} - ] - }, - { - FIELD: "Recommended growth temperature", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD30"}, - {TYPE: MISSING, ERROR_CODE: "STD31"}, - {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', - ERROR_CODE: "STD32", - MULTIPLE: True, SEPARATOR: ";"} - ] - }, - { - FIELD: "Recommended medium for growth", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD33"}, - {TYPE: MISSING, ERROR_CODE: "STD34"}, - {TYPE: CROSSREF, CROSSREF_NAME: "Growth media", - MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"} - ] - }, - { - FIELD: "Form of supply", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD36"}, - {TYPE: MISSING, ERROR_CODE: "STD37"}, - {TYPE: CHOICES, VALUES: ['Agar', 'Cryo', 'Dry Ice', 'Liquid Culture Medium', - 'Lyo', 'Oil', 'Water'], - MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"} - ] - }, - { - FIELD: "Other denomination", - }, - { - FIELD: "Coordinates of geographic origin", - VALIDATION: [ - {TYPE: COORDINATES, ERROR_CODE: "STD39"}, - ] - }, - { - FIELD: "Altitude of geographic origin", - VALIDATION: [ - {TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"}, - ] - }, - { - # value can be in the cell or in another sheet. Don't configure this - FIELD: "Geographic origin", - }, - { - FIELD: "Isolation habitat", - }, - { - FIELD: "Ontobiotope term for the isolation habitat", - VALIDATION: [ - {TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope", - MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"} - ] - }, - { - FIELD: "GMO", - VALIDATION: [ - {TYPE: CHOICES, VALUES: ["1", "2"], - ERROR_CODE: "STD42"} - ] - }, - { - FIELD: "GMO construction information", - }, - { - FIELD: "Mutant information", - }, - { - FIELD: "Genotype", - }, - { - FIELD: "Sexual state", - VALIDATION: [ - {TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET, - ERROR_CODE: "STD43"} - # {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha", - # "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha", - # "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"], - # ERROR_CODE: "STD43"} - ] - }, - { - FIELD: "Ploidy", - VALIDATION: [ - {TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"], - ERROR_CODE: "STD44"} - ] - }, - { - FIELD: "Plasmids", - }, - { - FIELD: "Plasmids collections fields", - }, - { - # value can be in the cell or in another sheet. Don't configure this - FIELD: "Literature", - VALIDATION: [ - {TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET, - MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"} - ] - }, - { - FIELD: "Plant pathogenicity code", - }, - { - FIELD: "Pathogenicity", - }, - { - FIELD: "Enzyme production", - }, - { - FIELD: "Production of metabolites", - }, - { - FIELD: "Applications", - }, - { - FIELD: "Remarks" - }, - { - FIELD: "Literature linked to the sequence/genome", - }, -] -SHEETS_SCHEMA = { - LOCATIONS: { - "acronym": "GOD", - "id_field": "ID", - VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"}, - COLUMNS: [ - { - FIELD: "ID", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GOD01"}, - {TYPE: MISSING, ERROR_CODE: "GOD02"}, - ] - }, - { - FIELD: "Country", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GOD03"}, - {TYPE: MISSING, ERROR_CODE: "GOD04"} - ] - }, - { - FIELD: "Region", - VALIDATION: [] - }, - { - FIELD: "City", - VALIDATION: [] - }, - { - FIELD: "Locality", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GOD06"}, - {TYPE: MISSING, ERROR_CODE: "GOD07"} - ] - } - ], - }, - GROWTH_MEDIA: { - "acronym": "GMD", - "id_field": "Acronym", - VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"}, - COLUMNS: [ - { - FIELD: "Acronym", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GMD01"}, - {TYPE: MISSING, ERROR_CODE: "GMD02"} - ] - }, - { - FIELD: "Description", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GMD03"}, - {TYPE: MISSING, ERROR_CODE: "GMD04"} - ] - }, - { - FIELD: "Full description", - VALIDATION: [] - }, - ], - }, - GENOMIC_INFO: { - "acronym": "GID", - "id_field": "Strain AN", - VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"}, - COLUMNS: [ - { - FIELD: "Strain AN", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GID01"}, - {TYPE: MISSING, ERROR_CODE: "GID02"}, - {TYPE: CROSSREF, CROSSREF_NAME: "Strains", - ERROR_CODE: "GID03"}, - ] - }, - { - FIELD: "Marker", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GID04"}, - {TYPE: MISSING, ERROR_CODE: "GID05"}, - {TYPE: CROSSREF, CROSSREF_NAME: MARKERS, ERROR_CODE: "GID06"} - ] - }, - { - FIELD: "INSDC AN", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "GID07"}, - {TYPE: MISSING, ERROR_CODE: "GID08"}, - ] - }, - { - FIELD: "Sequence", - VALIDATION: [] - }, - ], - }, - STRAINS: { - "acronym": "STD", - 'id_field': 'Accession number', - VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"}, - ROW_VALIDATION: [ - {TYPE: NAGOYA, ERROR_CODE: "STD46"}, - ], - COLUMNS: STRAIN_FIELDS, - }, - LITERATURE_SHEET: { - "acronym": "LID", - 'id_field': 'ID', - VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"}, - ROW_VALIDATION: [ - {TYPE: BIBLIO, ERROR_CODE: 'LID17'} - ], - COLUMNS: [ - { - FIELD: "ID", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID01"}, - {TYPE: MISSING, ERROR_CODE: "LID02"}, - ] - }, - { - FIELD: "Full reference", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID03"}, - ] - }, - { - FIELD: "Authors", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID05"}, - ] - }, - { - FIELD: "Title", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID07"}, - ] - }, - { - FIELD: "Journal", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID09"}, - ] - }, - { - FIELD: "Year", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID11"}, - ] - }, - { - FIELD: "Volume", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID13"}, - ] - }, - { - FIELD: "Issue", - VALIDATION: [] - }, - { - FIELD: "First page", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "LID15"}, - {TYPE: MISSING, ERROR_CODE: "LID16"}, - ] - }, - { - FIELD: "Last page", - VALIDATION: [] - }, - { - FIELD: "Book title", - VALIDATION: [] - }, - { - FIELD: "Editors", - VALIDATION: [] - }, - { - FIELD: "Publisher", - VALIDATION: [] - } - ], - }, - # SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []}, - # RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []}, - # FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []}, - # PLOIDY_SHEET: {"acronym": "PLD", COLUMNS: []}, - ONTOBIOTOPE: { - "acronym": "OTD", - "id_field": "ID", - VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"}, - COLUMNS: [ - { - FIELD: "ID", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "OTD01"}, - {TYPE: MISSING, ERROR_CODE: "OTD02"}, - ] - }, - { - FIELD: "Name", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "OTD03"}, - {TYPE: MISSING, ERROR_CODE: "OTD04"}, - ] - }, - ] - }, - MARKERS: { - "acronym": "MKD", - "id_field": "Acronym", - COLUMNS: [ - { - FIELD: "Acronym", - VALIDATION: [] - }, - { - FIELD: "Marker", - VALIDATION: [] - }, - ], - }, -} - -CROSS_REF_CONF = { - ONTOBIOTOPE: ['ID', 'Name'], - LITERATURE_SHEET: ['ID'], - LOCATIONS: ['Locality'], - GROWTH_MEDIA: ['Acronym'], - STRAINS: ["Accession number"], - SEXUAL_STATE_SHEET: [], - MARKERS: ["Acronym"], -} - -MIRRI_20200601_VALLIDATION_CONF = { - 'sheet_schema': SHEETS_SCHEMA, - 'cross_ref_conf': CROSS_REF_CONF, - 'keep_sheets_in_memory': [ - {'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}] -}