diff --git a/mirri/io/__init__.py b/mirri/io/__init___1.py similarity index 100% rename from mirri/io/__init__.py rename to mirri/io/__init___1.py diff --git a/mirri/io/parsers/mirri_excel.py b/mirri/io/parsers/mirri_excel.py index 549ae3d..ac1b141 100644 --- a/mirri/io/parsers/mirri_excel.py +++ b/mirri/io/parsers/mirri_excel.py @@ -37,14 +37,14 @@ TRUEFALSE_TRANSLATOR = { } -def parse_mirri_excel(fhand, version="20200601"): - if version == "20200601": - return _parse_mirri_v20200601(fhand) +def parse_mirri_excel(fhand, version=""): + if version == "5.1.2": + return _parse_mirri_v12052023(fhand) else: - raise NotImplementedError("Only version 20200601 is implemented") + raise NotImplementedError("Only version is 5.1.2 implemented") -def _parse_mirri_v20200601(fhand): +def _parse_mirri_v12052023(fhand): fhand.seek(0) file_content = BytesIO(fhand.read()) wb = load_workbook(filename=file_content, read_only=True, data_only=True) @@ -64,7 +64,6 @@ def _parse_mirri_v20200601(fhand): return {"strains": strains, "growth_media": growth_media} - def index_list_by(list_, id_): return {str(item[id_]): item for item in list_} @@ -125,7 +124,7 @@ def parse_strains(wb, locations, growth_media, markers, publications, publications = index_list_by_attr(publications, 'id') markers = index_markers(markers) - for strain_row in workbook_sheet_reader(wb, STRAINS, "Accession number"): + for strain_row in workbook_sheet_reader(wb, STRAINS, "accessionNumber"): strain = StrainMirri() strain_id = None label = None @@ -140,7 +139,7 @@ def parse_strains(wb, locations, growth_media, markers, publications, collection, number = value.split(" ", 1) value = StrainId(collection=collection, number=number) rsetattr(strain, attribute, value) - + elif attribute == "restriction_on_use": rsetattr(strain, attribute, RESTRICTION_USE_TRANSLATOR[value]) elif attribute == "nagoya_protocol": @@ -202,9 +201,19 @@ def parse_strains(wb, locations, growth_media, markers, publications, items = value.split(";") strain.collect.location.latitude = float(items[0]) strain.collect.location.longitude = float(items[1]) + strain.collect.location.precision = float(items[2]) + strain.collect.location.altitude = float(items[3]) + if len(items) > 4: + strain.collect.location.coord_uncertainty = items[4] + + elif attribute == "collect.site.links": + items = value.split(";") + strain.collect.site.links.nameSite = str(items[0]) + strain.collect.site.links.urlSite = str(items[1]) + rsetattr(strain, attribute, value.split(";")) #ver o separador if len(items) > 2: - strain.collect.location.coord_uncertainty = items[2] - + strain.collect.site.links.site_uncertainty = items[2] + elif attribute == "collect.location": location = locations[value] if 'Country' in location and location['Country']: diff --git a/mirri/io/writers/mirri_excel.py b/mirri/io/writers/mirri_excel.py index b4cb4ac..c66e99d 100644 --- a/mirri/io/writers/mirri_excel.py +++ b/mirri/io/writers/mirri_excel.py @@ -50,11 +50,10 @@ PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS] def write_mirri_excel(path, strains, growth_media, version): - if version == "20200601": - _write_mirri_excel_20200601(path, strains, growth_media) + if version == "5.1.2": + _write_mirri_excel_12052023(path, strains, growth_media) - -def _write_mirri_excel_20200601(path, strains, growth_media): +def _write_mirri_excel_12052023(path, strains, growth_media): wb = Workbook() write_markers_sheet(wb) @@ -104,7 +103,7 @@ def _write_mirri_excel_20200601(path, strains, growth_media): redimension_cell_width(pub_sheet) # write sexual states - sex_sheet = wb.create_sheet("Sexual states") + sex_sheet = wb.create_sheet("Sexual state") for sex_state in sorted(list(sexual_states)): sex_sheet.append([sex_state]) redimension_cell_width(sex_sheet) @@ -121,7 +120,6 @@ def _write_mirri_excel_20200601(path, strains, growth_media): del wb["Sheet"] wb.save(str(path)) - def _deserialize_strains(strains, locations, growth_media_indexes, publications, sexual_states, genomic_markers): for strain in strains: @@ -189,10 +187,21 @@ def _deserialize_strains(strains, locations, growth_media_indexes, elif attribute == "collect.location.coords": lat = strain.collect.location.latitude long = strain.collect.location.longitude - if lat is not None and long is not None: - value = f"{lat};{long}" + alt = strain.collect.location.altitude + prec = strain.collect.location.precision + if lat is not None and long is not None and prec is not None and alt is not None: + value = f"{lat};{long};{prec};{alt}" else: - value = None + value = None + elif attribute == "collect.site.links": + name = strain.collect.site.links.nameSite + url = strain.collect.site.links.urlSite + value = rgetattr(strain, attribute) + value = ";".join(value) + if name is not None and url is not None: + value = f"{name};{url}" + else: + value = None elif attribute == "collect.location": location = strain.collect.location diff --git a/mirri/settings.py b/mirri/settings.py index 8f731f6..adca2e7 100644 --- a/mirri/settings.py +++ b/mirri/settings.py @@ -3,6 +3,7 @@ from pathlib import Path DATA_DIR = Path(__file__).parent / "data" ACCESSION_NUMBER = "accession_number" +MIRRI_ACCESSION_NUMBER = 'mirri_accession_number' RESTRICTION_ON_USE = "restriction_on_use" NAGOYA_PROTOCOL = "nagoya_protocol" ABS_RELATED_FILES = "abs_related_files" @@ -14,6 +15,7 @@ DUAL_USE = "dual_use" QUARANTINE = "quarantine" ORGANISM_TYPE = "organism_type" TAXON_NAME = "taxon_name" +TYPE = "type" INFRASUBSPECIFIC_NAME = "infrasubspecific_names" COMMENTS_ON_TAXONOMY = "comments_on_taxonomy" STATUS = "status" @@ -54,6 +56,9 @@ SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation" ISOLATION_HABITAT = "isolation_habitat" ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat" LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome" +AXENIC_CULTURE = "axenic_culture" +QPS ="qps" +SITE_LINK = "site_links" # StrainId STRAIN_ID = "id" @@ -99,73 +104,80 @@ ALLOWED_COLLECTING_SITE_KEYS = [ ] MIRRI_FIELDS = [ - {"attribute": "id", "label": "Accession number"}, - {"attribute": "restriction_on_use", "label": "Restrictions on use"}, + {"attribute": "id", "label": "accessionNumber"}, + {"attribute": "mirri_accession_number", "label": "mirriAccessionNumber"}, + {"attribute": "qps", "label": "qps"}, + {"attribute": "axenic_culture", "label": "axenicCulture"}, + {"attribute": "restriction_on_use", "label": "useRestrictions"}, {"attribute": "nagoya_protocol", - "label": "Nagoya protocol restrictions and compliance conditions"}, - {"attribute": ABS_RELATED_FILES, "label": "ABS related files"}, - {"attribute": "mta_files", "label": "MTA file"}, - {"attribute": "other_numbers", "label": "Other culture collection numbers"}, + "label": "nagoyaConditions"}, + {"attribute": ABS_RELATED_FILES, "label": "absFile"}, + {"attribute": "mta_files", "label": "mtaFile"}, + {"attribute": "other_numbers", "label": "otherCollectionNumbers"}, {"attribute": "is_from_registered_collection", - "label": "Strain from a registered collection"}, - {"attribute": "risk_group", "label": "Risk Group"}, - {"attribute": "is_potentially_harmful", "label": "Dual use"}, - {"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"}, - {"attribute": "taxonomy.organism_type", "label": "Organism type"}, - {"attribute": "taxonomy.taxon_name", "label": "Taxon name"}, + "label": "registeredCollection"}, + {"attribute": "risk_group", "label": "riskGroup"}, + {"attribute": "is_potentially_harmful", "label": "dualUse"}, + {"attribute": "is_subject_to_quarantine", "label": "euQuarantine"}, + {"attribute": "taxonomy.organism_type", "label": "organismType"}, + {"attribute": "taxonomy.taxon_name", "label": "speciesName"}, {"attribute": "taxonomy.infrasubspecific_name", - "label": "Infrasubspecific names"}, - {"attribute": "taxonomy.comments", "label": "Comment on taxonomy"}, + "label": "infrasubspecificNames"}, + {"attribute": "taxonomy.comments", "label": "taxonomyComments"}, {"attribute": "taxonomy.interspecific_hybrid", - "label": "Interspecific hybrid"}, - {"attribute": "status", "label": "Status"}, - {"attribute": "history", "label": "History of deposit", }, - {"attribute": "deposit.who", "label": "Depositor"}, - {"attribute": "deposit.date", "label": "Date of deposit"}, + "label": "hybrid"}, + {"attribute": "status", "label": "status"}, + {"attribute": "history", "label": "depositHistory", }, + {"attribute": "deposit.who", "label": "depositor"}, + {"attribute": "deposit.date", "label": "depositDate"}, {"attribute": "catalog_inclusion_date", - "label": "Date of inclusion in the catalogue"}, - {"attribute": "collect.who", "label": "Collected by"}, - {"attribute": "collect.date", "label": "Date of collection"}, - {"attribute": "isolation.who", "label": "Isolated by"}, - {"attribute": "isolation.date", "label": "Date of isolation"}, + "label": "accessionDate"}, + {"attribute": "collect.who", "label": "collector"}, + {"attribute": "collect.date", "label": "collectionDate"}, + {"attribute": "isolation.who", "label": "isolator"}, + {"attribute": "isolation.date", "label": "isolationDate"}, {"attribute": "isolation.substrate_host_of_isolation", - "label": "Substrate/host of isolation"}, + "label": "substrate"}, {"attribute": "growth.tested_temp_range", - "label": "Tested temperature growth range"}, + "label": "temperatureGrowthRange"}, {"attribute": "growth.recommended_temp", - "label": "Recommended growth temperature"}, + "label": "recommendedTemperature"}, {"attribute": "growth.recommended_media", - "label": "Recommended medium for growth"}, - {"attribute": "form_of_supply", "label": "Form of supply"}, - {"attribute": "other_denominations", "label": "Other denomination"}, + "label": "recommendedMedium"}, + {"attribute": "form_of_supply", "label": "supplyForms"}, + {"attribute": "other_denominations", "label": "otherDenomination"}, {"attribute": "collect.location.coords", - "label": "Coordinates of geographic origin"}, + "label": "geographicCoordinates"}, + {"attribute": "collect.site.links", + "label": "siteLinks"}, {"attribute": "collect.location.altitude", - "label": "Altitude of geographic origin"}, - {"attribute": "collect.location", "label": "Geographic origin"}, - {"attribute": "collect.habitat", "label": "Isolation habitat"}, + "label": "country"}, + {"attribute": "collect.location", "label": "geographicOrigin"}, + {"attribute": "collect.habitat", "label": "isolationHabitat"}, {"attribute": "collect.habitat_ontobiotope", - "label": "Ontobiotope term for the isolation habitat"}, - {"attribute": "genetics.gmo", "label": "GMO"}, + "label": "ontobiotopeTerms"}, + {"attribute": "genetics.gmo", "label": "gmo"}, {"attribute": "genetics.gmo_construction", - "label": "GMO construction information"}, - {"attribute": "genetics.mutant_info", "label": "Mutant information"}, - {"attribute": "genetics.genotype", "label": "Genotype"}, - {"attribute": "genetics.sexual_state", "label": "Sexual state"}, - {"attribute": "genetics.ploidy", "label": "Ploidy"}, - {"attribute": "genetics.plasmids", "label": "Plasmids"}, + "label": "gmoConstruction"}, + {"attribute": "genetics.mutant_info", "label": "mutant"}, + {"attribute": "genetics.genotype", "label": "genotype"}, + {"attribute": "genetics.sexual_state", "label": "sexualState"}, + {"attribute": "genetics.ploidy", "label": "ploidy"}, + {"attribute": "genetics.plasmids", "label": "plasmids"}, {"attribute": "genetics.plasmids_in_collections", - "label": "Plasmids collections fields"}, - {"attribute": "publications", "label": "Literature"}, + "label": "plasmidCollections"}, + {"attribute": "publications", "label": "identificationLiterature"}, {"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"}, - {"attribute": "pathogenicity", "label": "Pathogenicity"}, - {"attribute": "enzyme_production", "label": "Enzyme production"}, + {"attribute": "pathogenicity", "label": "pathogenicity"}, + {"attribute": "enzyme_production", "label": "enzymes"}, {"attribute": "production_of_metabolites", - "label": "Production of metabolites"}, - {"attribute": "applications", "label": "Applications", }, - {"attribute": "remarks", "label": "Remarks"}, + "label": "metabolites"}, + {"attribute": "type", + "label": "type"}, + {"attribute": "applications", "label": "applications", }, + {"attribute": "remarks", "label": "remarks"}, {"attribute": LITERATURE_LINKED_TO_SEQ_GENOME, - "label": "Literature linked to the sequence/genome"}, + "label": "sequenceLiterature"}, ] ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma", @@ -228,8 +240,9 @@ ALLOWED_MARKER_TYPES = [ ] PUBLICATIONS = "publications" -PUB_ID = "id" +PUB_ID = "pub_id" PUB_DOI = "pub_doi" +PUB_PMID = "pub_pmid" PUB_PUBMED_ID = '' PUB_FULL_REFERENCE = "full_reference" PUB_TITLE = "title" @@ -247,6 +260,8 @@ BOOK_PUBLISHER = "book_publisher" PUBLICATION_FIELDS = [ {"label": "ID", "attribute": PUB_ID}, + {"label": "PMID", "attribute": PUB_PMID}, + {"label": "DOI", "attribute": PUB_DOI}, {"label": "Full reference", "attribute": PUB_FULL_REFERENCE}, {"label": "Authors", "attribute": PUB_AUTHORS}, {"label": "Title", "attribute": PUB_TITLE}, @@ -282,15 +297,43 @@ SUBTAXAS = { "f.sp.": "forma.specialis" } +#Control +VERSION = "Version" +DATE = "Date" + + +#Country codes +COUNTRY = "Country" +CODE = "Code" +ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE = "Additional information on the country or code" + + +#Country codes files +COUNTRY_CODES_SHEET = [ + {"label": "Country", "attribute": COUNTRY}, + {"label": "Code", "attribute": CODE}, + {"label": "Additional information on the country or code", "attribute": ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE}, +] + + +#Controle files +CONTROL_FIELDS = [ + {"label": "Version", "attribute": VERSION}, + {"label": "Date", "attribute": DATE}, +] + # Excel sheet name LOCATIONS = "Geographic origin" # 'Locations' GROWTH_MEDIA = "Growth media" GENOMIC_INFO = "Genomic information" STRAINS = "Strains" LITERATURE_SHEET = "Literature" -SEXUAL_STATE_SHEET = "Sexual states" +SEXUAL_STATE_SHEET = "Sexual state" RESOURCE_TYPES_VALUES = "Resource types values" FORM_OF_SUPPLY_SHEET = "Forms of supply" PLOIDY_SHEET = "Ploidy" ONTOBIOTOPE = "Ontobiotope" MARKERS = "Markers" +CONTROL_SHEET = "Version" +COUNTRY_CODES_SHEET = "Country codes" +RESOURCE_SHEET = 'Resource types values' diff --git a/mirri/validation/entity_validators.py b/mirri/validation/entity_validators.py deleted file mode 100644 index e1e02d0..0000000 --- a/mirri/validation/entity_validators.py +++ /dev/null @@ -1,50 +0,0 @@ -from mirri import rgetattr - - -def validate_strain(strain, version='20200601'): - if version == '20200601': - return _validate_strain_v20200601(strain) - raise NotImplementedError('Only v20200601 is implemented') - - -def _validate_strain_v20200601(strain): - mandatory_attrs = [{'label': 'Accession Number', 'attr': 'id.strain_id'}, - {'label': 'Nagoya protocol', 'attr': 'nagoya_protocol'}, - {'label': 'Restriction on use', 'attr': 'restriction_on_use'}, - {'label': 'Risk group', 'attr': 'risk_group'}, - {'label': 'Organism type', 'attr': 'taxonomy.organism_type'}, - {'label': 'Taxon name', 'attr': 'taxonomy.long_name'}, - {'label': 'Recommended temperature to growth', 'attr': 'growth.recommended_temp'}, - {'label': 'Recommended media', 'attr': 'growth.recommended_media'}, - {'label': 'Form of supply', 'attr': 'form_of_supply'}, - {'label': 'Country', 'attr': 'collect.location.country'}] - - errors = [] - - for mandatory in mandatory_attrs: - value = rgetattr(strain, mandatory['attr']) - if value is None: - errors.append(f"{mandatory['label']} is mandatory field") - - if not is_valid_nagoya(strain): - errors.append('Not compliant wih nagoya protocol requirements') - - return errors - - -def is_valid_nagoya(strain): - # nagoya_requirements - _date = strain.collect.date - if _date is None: - _date = strain.isolation.date - if _date is None: - _date = strain.deposit.date - if _date is None: - _date = strain.catalog_inclusion_date - # print(_date) - year = None if _date is None else _date._year - - if year is not None and year >= 2014 and strain.collect.location.country is None: - return False - - return True diff --git a/mirri/validation/error_logging/error.py b/mirri/validation/error_logging/error.py index 2e65c8b..fc3d16b 100644 --- a/mirri/validation/error_logging/error.py +++ b/mirri/validation/error_logging/error.py @@ -62,6 +62,10 @@ class Entity(): def GID(self) -> str: return 'Genomic Information' + + def VRS(self) -> str: + return 'Version' + def OTD(self) -> str: return 'Ontobiotope' diff --git a/mirri/validation/error_logging/error_message.py b/mirri/validation/error_logging/error_message.py index 7188a9b..b51810b 100644 --- a/mirri/validation/error_logging/error_message.py +++ b/mirri/validation/error_logging/error_message.py @@ -92,6 +92,9 @@ class ErrorMessage(): def EFS08(self): return "The 'Genomic information' sheet is missing. Please check the provided excel template." + + def EFS09(self): + return "The 'Version' sheet is missing. Please check the provided excel template." """ Growth Media Error Codes @@ -147,26 +150,26 @@ class ErrorMessage(): def LID03(self): return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty." - def LID04(self): - return f"The 'Full reference' for literature with ID {self.pk} is missing." + #def LID04(self): + #return f"The 'Full reference' for literature with ID {self.pk} is missing." def LID05(self): return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty." - def LID06(self): - return f"The 'Authors' for literature with ID {self.pk} is missing." + #def LID06(self): + #return f"The 'Authors' for literature with ID {self.pk} is missing." def LID07(self): return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty." - def LID08(self): - return f"The 'Title' for literature with ID {self.pk} is missing." + #def LID08(self): + #return f"The 'Title' for literature with ID {self.pk} is missing." def LID09(self): return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty." - def LID10(self): - return f"The 'Journal' for literature with ID {self.pk} is missing." + #def LID10(self): + #return f"The 'Journal' for literature with ID {self.pk} is missing." def LID11(self): return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty." @@ -187,167 +190,191 @@ class ErrorMessage(): return f"The 'First page' for literature with ID {self.pk} is missing." def LID17(self): - msg = 'If journal; Title, Authors, journal, year and first page are required' - msg += 'If Book; Book Title, Authors, Year, Editors, Publishers' - return msg + return( f"There are four types of ways to fill in the 'Literature' sheet.", + "1st- Columns 'ID' and 'DOI' must be obrigatory.", + "2nd-Columns 'ID' and 'PMID' are obrigatory.", + "3rd-Columns 'ID' and 'Full reference' are obrigatory.", + "In the alternative of these three types of forms not being filled in, we have:", + "4th-Columns 'ID', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'.") + + def LID18(self): + return "The 'PMID' column is a mandatory field. The column can not be empty." + + #def LID19(self): + #return f"PMID for literature with ID {self.pk} is missing." + + def LID20(self): + return "The 'DOI' column is a mandatory field. The column can not be empty." + + #def LID21(self): + #return f"DOI for literature with ID {self.pk} is missing." """ Strains Error Codes """ - def STD01(self): - return "The 'Accession number' column is a mandatory field in the Strains sheet." + return "The 'accessionNumber' column is a mandatory field in the Strains sheet." def STD02(self): - return "The 'Accession number' column is empty or has missing values." + return "The 'accessionNumber' column is empty or has missing values." def STD03(self): - return f"The 'Accesion number' must be unique. The '{self.value}' is repeated." + return f"The 'accessionNumber' must be unique. The '{self.value}' is repeated." def STD04(self): - return (f"The 'Accession number' {self.pk} is not according to the specification." + return (f"The 'accessionNumber' {self.pk} is not according to the specification." " The value must be of the format ' '.") def STD05(self): - return f"The 'Restriction on use' column is a mandatory field in the Strains Sheet. The column can not be empty." + return f"The 'useRestrictions' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD06(self): - return f"The 'Restriction on use' for strain with Accession Number {self.pk} is missing." + return f"The 'useRestrictions' for strain with accessionNumber {self.pk} is missing." def STD07(self): - return (f"The 'Restriction on use' for strain with Accession Number {self.pk} is not according to the specification." + return (f"The 'useRestrictions' for strain with accessionNumber {self.pk} is not according to the specification." f" Your value is {self.value} and the accepted values are 1, 2, 3.") def STD08(self): - return f"The 'Nagoya protocol restrictions and compliance conditions' column is a mandatory field in the Strains Sheet. The column can not be empty." + return f"The 'nagoyaConditions' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD09(self): - return f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is missing." + return f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is missing." def STD10(self): - return (f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is not according to the specification." + return (f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is not according to the specification." f" Your value is {self.value} and the accepted values are 1, 2, 3.") def STD11(self): - return (f"The 'Strain from a registered collection' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'registeredCollection' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 1, 2, 3.") def STD12(self): - return "The 'Risk group' column is a mandatory field in the Strains Sheet. The column can not be empty." + return "The 'riskGroup' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD13(self): - return f"The 'Risk group' for strain with Accession Number {self.pk} is missing." + return f"The 'riskGroup' for strain with accessionNumber {self.pk} is missing." def STD14(self): - return (f"The 'Risk group' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'riskGroup' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.") def STD15(self): - return (f"The 'Dual use' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'dualUse' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 1, 2.") def STD16(self): - return (f"The “Quarantine in europe” for strain with Accession Number {self.pk} is not according to specification." + return (f"The “euQuarantine” for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 1, 2.") def STD17(self): - return f"The 'Organism type' column is a mandatory field in the Strains Sheet. The column can not be empty." + return f"The 'organismType' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD18(self): - return f"The 'Organism type' for strain with Accession Number {self.pk} is missing." + return f"The 'organismType' for strain with accessionNumber {self.pk} is missing." def STD19(self): - return (f"The 'Organism type' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'organismType' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', " "'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.") def STD20(self): - return f"The 'Taxon name' column is a mandatory field in the Strains Sheet. The column can not be empty." + return f"The 'speciesName' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD21(self): - return f"The 'Taxon name' for strain with Accession Number {self.pk} is missing." + return f"The 'speciesName' for strain with accessionNumber {self.pk} is missing." def STD22(self): - return f"The 'Taxon name' for strain with Accession Number {self.pk} is incorrect." + return f"The 'speciesName' for strain with accessionNumber {self.pk} is incorrect." def STD23(self): - return (f"The 'Interspecific hybrid' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'hybrid' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 1, 2.") def STD24(self): - return f"The 'History of deposit' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'depositHistory' for strain with accessionNumber {self.pk} is incorrect." + "The field includes entries separated by '<' meaning 'received from'." + "Entries may include persons or CCs. The name of the CC should be followed by" + "the month, when available, and year of the acquisition. Between parentheses," + "the strain designation or CC numbers and/or a name can also be entered when " + "a name change has occurred.") def STD25(self): - return (f"The 'Date of deposit' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'depositDate' for strain with accessionNumber {self.pk} is incorrect." " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") def STD26(self): - return (f"The 'Date of inclusion in the catalogue' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'accessionDate' for strain with accessionNumber {self.pk} is incorrect." " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") def STD27(self): - return (f"The 'Date of collection' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'collectionDate' for strain with accessionNumber {self.pk} is incorrect." " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") def STD28(self): - return (f"The 'Date of isolation' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'isolationDate' for strain with accessionNumber {self.pk} is incorrect." " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") def STD29(self): - return (f"The 'Tested temperature growth range' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect." " It must have two decimal numbers separated by ','") def STD30(self): - return f"The 'Recommended growth temperature' column is a mandatory field in the Strains Sheet. The column can not be empty." + return f"The 'temperatureGrowthRange' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD31(self): - return f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is missing." + return f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is missing." def STD32(self): - return (f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is incorrect." + return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect." " It must have two decimal numbers separated by ','.") def STD33(self): - return f"The 'Recommended medium for growth' column is a mandatory field in the Strains Sheet. The column can not be empty." + return ("The 'recommendedTemperature' column is a mandatory field in the Strains Sheet. The column can not be empty.") def STD34(self): - return f"The 'Recommended medium for growth' for strain with Accession Number {self.pk} is missing." + return f"The 'recommendedTemperature' for strain with accessionNumber {self.pk} is missing." def STD35(self): - return f"The value of 'Recommended medium for growth' for strain with Accession Number {self.pk} is not in the Growth Media Sheet." + return f"The value of 'recommendedTemperature' for strain with accessionNumber {self.pk} is not in the Growth Media Sheet." def STD36(self): - return f"The 'Forms of supply' column is a mandatory field in the Strains Sheet. The column can not be empty." + return f"The 'supplyForms' column is a mandatory field in the Strains Sheet. The column can not be empty." def STD37(self): - return f"The 'Forms of supply' for strain with Accession Number {self.pk} is missing." + return f"The 'supplyForms' for strain with accessionNumber {self.pk} is missing." def STD38(self): - return f"The value of 'Forms of supply' for strain with Accession Number {self.pk} is not in the Forms of Supply Sheet." + return f"The value of 'supplyForms' for strain with accessionNumber {self.pk} is not in the Forms of Supply Sheet." def STD39(self): - return (f"The 'Coordinates of geographic origin' column for strain with Accession Number {self.pk} is incorrect." - "The allowed formats are two or three decimal numbers separated by ','. Moreover, the first number must be" - "between [-90, 90], the second between [-180, 180], and the third, if provided, can assume any value.") + return (f"The 'geographicCoordinates' column for strain with accessionNumber {self.pk} is incorrect." + "The allowed formats are two, three or four decimal numbers separated by ','. Moreover, the first number must be." + "between [-90, 90], the second between [-180, 180], and the third and fourth refers to the precision and altitude, defined by decimal numbers." + "Put a question mark for lack of precision or altitude when one of them is missing. Leave the values blank when both are missing. ") def STD40(self): - return (f"The 'Altitude of geographic origin' column for strain with Accession Number {self.pk} is incorrect." + return (f"The 'country' column for strain with accessionNumber {self.pk} is incorrect." "The allowed formats are one decimal number between [-200, 8000].") + def STD54(self): + return (f"The 'country'column is a mandatory field in the Strains Sheet. The column can not be empty.") + def STD55(self): + return (f"The 'country' for strain with accessionNumber {self.pk} is missing.") def STD41(self): - return f"The value of 'Ontobiotope term for the isolation habitat' for strain with Accession Number {self.pk} is not in the Ontobiotope Sheet." + return f"The value of 'ontobiotopeTerms' for strain with accessionNumber {self.pk} is not in the Ontobiotope Sheet." def STD42(self): - return (f"The 'GMO' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'gmo' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 1, 2") def STD43(self): - return (f"The 'Sexual State' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'sexualState' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', " "'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'") def STD44(self): - return (f"The 'Ploidy' for strain with Accession Number {self.pk} is not according to specification." + return (f"The 'ploidy' for strain with accessionNumber {self.pk} is not according to specification." f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9") def STD45(self): @@ -356,24 +383,97 @@ class ErrorMessage(): return msg def STD46(self): - msg = f"If date of collection/isolation/deposit/inclusion in the catalog is after 2014," \ - f" the value of column Geographic Origin must be provided and associated with a country in the " \ - f"Geographic Origin sheet. The value is missing or not associated with a country for strain {self.pk}." - return msg + return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is not according to specification." + f"The 'geographicOrigin' column must consist of the ID's associated with the Geographic origin sheet.") + def STD47(self): + return "The 'country' column is a mandatory field in the Strains sheet." + def STD48(self): + return "The 'country' column is empty or has missing values." + + def STD49(self): + return (f"The “qps” for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2.") + + def STD50(self): + return (f"The “axenicCulture” for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 'Axenic', 'Not axenic'.") + + def STD51(self): + return f"The 'mirriAccessionNumber' must be unique. The '{self.pk}' is repeated." + + def STD52(self): + return (f"The 'mirriAccessionNumber' for strain with accessionNumber {self.pk} is incorrect." + " It must have the expression MIRRI followed by 7 digits") + + def STD53(self): + return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect." + " The displayed expression it should be composed of: site name ';' website url." ) + + def STD56(self): + return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect." + " The url must be valid. " ) + def STD57(self): + return (f"The 'country' for strain with accessionNumber {self.pk} is incorrect." + "This information must be expressed by using the ISO-3166 standard for country" + "codes. The preferred set is ISO 3166-1 alpha-2 (two letters code), but ISO 3166-" + "1 alpha-3 (three letters code) is also accepted. Former country codes must" + "follow standard’s part three ISO 3166-3 (four letters code). Only one code can" + "be included." ) + def STD58(self): + return (f"The 'mtaFile' for strain with accessionNumber {self.pk} is incorrect." + " The url must be valid. " ) + def STD59(self): + return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect." + "The displayed expression it should be composed of: name ';' website url." + "When only one URL is provided, the title may be omitted. In this case, the URL" + "will be shown in clear to users." ) + def STD60(self): + return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect." + " The url must be valid. ") + def STD61(self): + return (f"The 'sequenceLiterature' for strain with accessionNumber {self.pk} is incorrect." + "Numeric identifiers separated by a semicolon ';'.") + + def STD62(self): + return (f"The 'plasmidCollections' for strain with accessionNumber {self.pk} is incorrect." + "It should include the name of the plasmid followed by the CC number in" + "parentheses. More than one plasmid can be reported, separated by ';'. " + "Plasmid names should be provided as free text." + "CC numbers should be composed by the CC acronym followed by a number" + "separated by a space'. Numeric identifiers separated by a semicolon ';'.") + + def STD63(self): + return (f"The 'otherCollectionNumbers' for strain with accessionNumber {self.pk} is incorrect." + " The value must be of the format ' '.") + + def STD64(self): + return (f"The 'type' for strain with accessionNumber {self.pk} is incorrect." + f"Your value is {self.value} and the accepted values are 1, 2.") + + def STD65(self): + return (f"The 'status' for strain with accessionNumber {self.pk} is incorrect." + "The structure should be 'type of .") + + def STD68(self): + return (f"The 'geographicOrigin'column is a mandatory field in the Strains Sheet. The column can not be empty.") + + def STD69(self): + return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is missing.") + """ Genomic Information Error Codes """ def GID01(self): - return f"The 'Strain Acession Number' (Strain AN) column is a mandatory field in the Genomic Information Sheet." + return f"The 'Strain accessionNumber' (Strain AN) column is a mandatory field in the Genomic Information Sheet." def GID02(self): - return f"The 'Strain Acession Number' (Strain AN) column is empty or has missing values." + return f"The 'Strain accessionNumber' (Strain AN) column is empty or has missing values." def GID03(self): - return f"The value of 'Strain Acession Number' (Strain AN) {self.value} is not in the Strains sheet." + return f"The value of 'Strain accessionNumber' (Strain AN) {self.value} is not in the Strains sheet." def GID04(self): return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty." @@ -397,6 +497,35 @@ class ErrorMessage(): return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect." " It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces.") + def GID11(self): + return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect." + "An INSDC accession number is an alphanumeric" + "code made by a fixed number of letters followed by a fixed number of digits," + "without any separation. For sequences, the code is currently made of two" + "letters followed by six numbers.") + + + """ + Version Error Codes + """ + + def VRS01(self): + return "The 'Version' columns is a mandatory field in the Version Sheet." + + def VRS02(self): + return "The 'Version' columns is empty or has missing values." + + def VRS03(self): + return "The 'Date' columns is a mandatory field in the Control Sheet." + + def VRS04(self): + return "The 'Date' columns is empty or has missing values." + + def VRS05(self): + return f"The version {self.value} is the only one to be used." + + + """ Ontobiotope Error Codes """ @@ -407,8 +536,12 @@ class ErrorMessage(): def OTD02(self): return "The 'ID' columns is empty or has missing values." - def OTD03(self): + #def OTD03(self): return "The 'Name' columns is a mandatory field in the Ontobiotope Sheet. The column can not be empty." - def OTD04(self): + #def OTD04(self): return f"The 'Name' for ontobiotope with ID {self.pk} is missing." + + + + \ No newline at end of file diff --git a/mirri/validation/excel_validator.py b/mirri/validation/excel_validator.py index 3b8e946..73ec3ad 100644 --- a/mirri/validation/excel_validator.py +++ b/mirri/validation/excel_validator.py @@ -4,27 +4,51 @@ from io import BytesIO from zipfile import BadZipfile from datetime import datetime from calendar import monthrange - +import requests from openpyxl import load_workbook +import pycountry from mirri.io.parsers.excel import workbook_sheet_reader, get_all_cell_data_from_sheet from mirri.validation.error_logging import ErrorLog, Error from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE, ERROR_CODE, FIELD, MANDATORY, MATCH, - MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, - TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO) + MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, + TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE, + HISTORY,NAGOYA1, VERSION) from mirri.settings import LOCATIONS, SUBTAXAS -from mirri.validation.validation_conf_20200601 import MIRRI_20200601_VALLIDATION_CONF +from mirri.validation.validation_conf_12052023 import version_config + +from mirri.validation.validation_conf_12052023 import MIRRI_12052023_VALLIDATION_CONF -def validate_mirri_excel(fhand, version="20200601"): - if version == "20200601": - configuration = MIRRI_20200601_VALLIDATION_CONF +def validate_mirri_excel(fhand, version= "5.1.2" ): + if version == "5.1.2": + configuration = MIRRI_12052023_VALLIDATION_CONF else: - raise NotImplementedError("Only version20200601 is implemented") - + raise NotImplementedError("Only version 5.1.2 is implemented") + return validate_excel(fhand, configuration) +def version(value , validation_conf=None): + if value is None: + return True + try: + for version in version_config: + if value == version : + return True + except: + return False + + +def validate_country_code(value,validation_conf=None): + if value is None: + return True + try: + if pycountry.countries.get(alpha_2=value) or pycountry.countries.get(alpha_3=value) or pycountry.historic_countries.get(alpha_4 = value): + return True + except: + return False + def validate_excel(fhand, configuration): validation_conf = configuration['sheet_schema'] @@ -185,11 +209,14 @@ def validate_row(row, validation_steps, in_memory_sheets): kind = validation_step[TYPE] error_code = validation_step[ERROR_CODE] if kind == NAGOYA: - if not is_valid_nagoya(row, in_memory_sheets): + if not is_valid_nagoya_v12052023(row, in_memory_sheets): return error_code elif kind == BIBLIO: if not is_valid_pub(row): return error_code + elif kind == NAGOYA1: + if not is_valid_nago(row): + return error_code else: msg = f'{kind} is not a recognized row validation type method' raise NotImplementedError(msg) @@ -207,49 +234,70 @@ def validate_cell(value, validation_steps, crossrefs, shown_values, label): if error_code is not None: return error_code - - + def is_valid_pub(row): + pub_id = row.get('ID', None) + pub_pmid = row.get('PMID', None) + pub_doi = row.get('DOI', None) title = row.get('Title', None) full_reference = row.get('Full reference', None) authors = row.get('Authors', None) journal = row.get('Journal', None) year = row.get('Year', None) - volumen = row.get('Volumen', None) + volumen = row.get('Volume', None) first_page = row.get('First page', None) book_title = row.get('Book title', None) editors = row.get('Editors', None) publishers = row.get('Publishers', None) - if full_reference: + if (pub_id != None and pub_doi != None) or (pub_id != None and pub_pmid != None) or (pub_id != None and full_reference != None) or (pub_id != None and authors != None and title != None and journal != None and year != None and volumen != None and first_page != None) : return True is_journal = bool(title) - if (is_journal and (not authors or not journal or not not year or - not volumen or not first_page)): - return False - if (not is_journal and (not authors or not year or - not editors or not publishers or not book_title)): - return False + # if (is_journal and (not authors or not journal or not not year or + # not volumen or not first_page)): + # return False + #if (not is_journal and (not authors or not year or + # not editors or not publishers or not book_title)): + # return False + return False + +def is_valid_nago(row): + if not row: + return True + status = row.get("status", None) + type = row.get("type", None) + regex = r'^[a-zA-Z\s.\'-]+$' + + if status != None and type != None: + if (re.match(regex, status) and type==1): + return False + if (type == 2 and status is None): + return False return True +def parsee_mirri_excel(row, in_memory_sheets, version=""): + if version == "12052023": + return is_valid_nagoya_v12052023 (row, in_memory_sheets) + else: + raise NotImplementedError("Only version is implemented") -def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity - location_index = row.get('Geographic origin', None) +def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity + location_index = row.get('geographicOrigin', None) if location_index is None: country = None else: geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {}) country = geo_origin.get('Country', None) - _date = row.get("Date of collection", None) + _date = row.get("collectionDate", None) if _date is None: - _date = row.get("Date of isolation", None) + _date = row.get("isolationDate", None) if _date is None: - _date = row.get("Date of deposit", None) + _date = row.get("depositDate", None) if _date is None: - _date = row.get("Date of inclusion in the catalogue", None) + _date = row.get("accessionDate", None) if _date is not None: year = _date.year if isinstance(_date, datetime) else int(str(_date)[:4]) else: @@ -258,9 +306,9 @@ def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity if year is not None and year >= 2014 and country is None: return False + return True - def is_valid_regex(value, validation_conf): if value is None: return True @@ -310,7 +358,9 @@ def is_valid_choices(value, validation_conf): values = [v.strip() for v in str(value).split(separator)] else: values = [str(value).strip()] - + sorted_values = sorted(values) + if sorted_values != values: + return False return all(value in choices for value in values) @@ -352,47 +402,145 @@ def is_valid_date(value, validation_conf): return True -def is_valid_coords(value, validation_conf=None): - # sourcery skip: return-identity +def is_valid_dominio(value, validation_conf=None): if value is None: return True try: items = [i.strip() for i in value.split(";")] - latitude = float(items[0]) - longitude = float(items[1]) - if len(items) > 2: - precision = float(items[2]) - if latitude < -90 or latitude > 90: - return False - if longitude < -180 or longitude > 180: - return False + if len(items) >1: + for i in range(0, len(items),2): + nameSite = str(items[i]) + urlSite = str(items[i+1]) + dominio = urlSite.split(".")[-2] + if nameSite.lower() != dominio: + return False + return True except: - return False + return False + +def is_valid_title(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + if len(items) >1: + for i in range(0, len(items),2): + nameSite = (items[i]) + urlSite = str(items[i+1]) + regex = r'^(http|https):\/\/[a-z0-9\-\.]+\.[a-z]{2,}([/a-z0-9\-\.]*)*$' + if re.match(regex, nameSite) or isinstance(nameSite, int) or nameSite == '': + return False + return True + except: + return False + +def is_valid_url_title(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + if len(items) ==1: + urlSite = str(items[0]) + response = requests.head(urlSite) + if response.status_code != 200: + return False + + else: + items = [i.strip() for i in value.split(";")] + for i in range(0, len(items),2): + nameSite = (items[i]) + urlSite = str(items[i+1]) + response = requests.head(urlSite) + if response.status_code != 200: + return False + + + return True + except: + return False + + +def is_valid_url_dominio(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + for i in range(0, len(items),2): + nameSite = str(items[i]) + urlSite = str(items[i+1]) + response = requests.head(urlSite) + if response.status_code != 200: + return False + + return True + except: + return False + + +def is_valid_just_url(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + for i in items: + nameSite = str(items[0]) + response = requests.head(i) + if response.status_code != 200: + return False + + return True + except: + return False + + +def is_valid_history(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split("<")] + for i in items: + regex1 = r'^[a-zA-Z0-9 &,;.:''-]+,?\s*((19|20)\d{2})' + regex2 = r'^[a-zA-Z0-9 &,;.:''-]+,?\s*[a-zA-Z0-9 &,;.''-] (19|20)\d{2}\s\([a-zA-Z0-9 &,;.''-:]+\)' + regex3 = r'^[a-zA-Z0-9 &,;.:''-]+\,?\s*[a-zA-Z0-9 &,;.''-]' + regex4 = r'^[a-zA-Z0-9 &,;.''-]+,?\s*(19|20)\d{2}\s\([a-zA-Z0-9 .''-,;&:]+\)' + regex5 = r'^[a-zA-Z0-9 &,;.:''-]+,?\s*\([a-zA-Z0-9 &,;.''-:]+\) (19|20)\d{2}' + if re.match(regex1, i): + return True + elif re.match(regex2, i): + return True + elif re.match(regex3, i): + return True + elif re.match(regex4, i): + return True + elif re.match(regex5, i): + return True + else: + return False + except: + return False + + +def is_valid_coords(value, validation_conf=None): + if value is None: + return True + try: + + regex1 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?)(\s*;\s*-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?))*$' + regex2 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?)\s*;\s*-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?)\s*;\s*(\d+\.\d+|\?)\s*;\s*(\d+\.\d+|\?)$|^(\d+\.\d+|\?)$|^\s*;\s*$' + + if not re.match(regex1, value) and not re.match(regex2, value): + return False + + return True + except: + return False def is_valid_missing(value, validation_conf=None): return value is not None -def is_valid_number(value, validation_conf): - if value is None: - return True - try: - value = float(value) - except TypeError: - return False - except ValueError: - return False - - _max = validation_conf.get('max', None) - _min = validation_conf.get('min', None) - if (_max is not None and value > _max) or (_min is not None and value < _min): - return False - - return True - - def is_valid_taxon(value, validation_conf=None): multiple = validation_conf.get(MULTIPLE, False) separator = validation_conf.get(SEPARATOR, ';') @@ -429,6 +577,8 @@ def _is_valid_taxon(value): def is_valid_unique(value, validation_conf): + if not value: + return True label = validation_conf['label'] shown_values = validation_conf['shown_values'] if label not in shown_values: @@ -444,7 +594,6 @@ def is_valid_unique(value, validation_conf): return True - def is_valid_file(path): try: with path.open("rb") as fhand: @@ -464,8 +613,15 @@ VALIDATION_FUNCTIONS = { CROSSREF: is_valid_crossrefs, DATE: is_valid_date, COORDINATES: is_valid_coords, - NUMBER: is_valid_number, TAXON: is_valid_taxon, + TITLE: is_valid_title, + DOMINIO: is_valid_dominio, + URL_TITLE: is_valid_url_title, + URL_DOMINIO: is_valid_url_dominio, + JUST_URL: is_valid_just_url, + ISO: validate_country_code, + HISTORY: is_valid_history, + VERSION: version, UNIQUE: is_valid_unique} diff --git a/mirri/validation/tags.py b/mirri/validation/tags.py index ef036c9..9fb35e0 100644 --- a/mirri/validation/tags.py +++ b/mirri/validation/tags.py @@ -16,9 +16,20 @@ MATCH = 'match' VALUES = 'values' DATE = 'date' COORDINATES = 'coord' +COORDINATES1 = 'coord1' NUMBER = 'number' TAXON = 'taxon' UNIQUE = 'unique' ROW_VALIDATION = 'row_validation' NAGOYA = 'nagoya' BIBLIO = 'bibliography' +DOMINIO= 'is_valid_dominio' +TITLE= 'is_valid_title' +URL_DOMINIO = 'urll_valid_dominio' +URL_TITLE= 'is_valid_url_title' +ISO = 'validate_country_code' +JUST_URL= 'is_valid_just_url' +HISTORY= 'is_valid_history' +MEU='is_valid_crossrefs_meu' +NAGOYA1 = 'nayoga1' +VERSION = 'version' \ No newline at end of file diff --git a/mirri/validation/validate_v5.py b/mirri/validation/validate_v5.py new file mode 100644 index 0000000..290fc00 --- /dev/null +++ b/mirri/validation/validate_v5.py @@ -0,0 +1,25 @@ +#!/usr/bin/env python +import pandas as pd +import sys +from pathlib import Path +import warnings +warnings.simplefilter("ignore") +from mirri.validation.excel_validator import validate_mirri_excel + +def main(): + path = Path(sys.argv[1]) + version = str(sys.argv[2]) + try: + + error_log = validate_mirri_excel(path.open("rb"), version=version) + + except NotImplementedError as e: + print(e) + + for errors in error_log.get_errors().values(): + for error in errors: + print(error.pk, error.message, error.code) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/mirri/validation/validation_conf_20200601.py b/mirri/validation/validation_conf_12052023.py similarity index 65% rename from mirri/validation/validation_conf_20200601.py rename to mirri/validation/validation_conf_12052023.py index 1d9752c..2870e8f 100644 --- a/mirri/validation/validation_conf_20200601.py +++ b/mirri/validation/validation_conf_12052023.py @@ -1,10 +1,13 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE, ERROR_CODE, FIELD, MANDATORY, MATCH, - MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE, - UNIQUE, - VALIDATION, VALUES, BIBLIO) + MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE, + UNIQUE,VERSION, + VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1) from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, - STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS) + STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET) + + + # GEOGRAPHIC_ORIGIN # SEXUAL_STATE_SHEET, # RESOURCE_TYPES_VALUES, @@ -12,9 +15,12 @@ from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, # PLOIDY_SHEET) + + STRAIN_FIELDS = [ + { - FIELD: "Accession number", + FIELD: "accessionNumber", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: 'STD01'}, {TYPE: UNIQUE, ERROR_CODE: 'STD03'}, @@ -23,16 +29,24 @@ STRAIN_FIELDS = [ ] }, { - FIELD: "Restrictions on use", + FIELD: "useRestrictions", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD05"}, - {TYPE: MISSING, ERROR_CODE: "STD06"}, + {TYPE: MISSING, ERROR_CODE: "STD06"}, {TYPE: CHOICES, VALUES: ["1", "2", "3"], MULTIPLE: False, ERROR_CODE: "STD07"} ] }, + { + FIELD: "mirriAccessionNumber", + VALIDATION: [ + {TYPE: UNIQUE, ERROR_CODE: 'STD51'}, + {TYPE: REGEXP, MATCH: "^MIRRI[0-9]{7}$", ERROR_CODE: "STD52"}, + ], + }, + { - FIELD: "Nagoya protocol restrictions and compliance conditions", + FIELD: "nagoyaConditions", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD08"}, {TYPE: MISSING, ERROR_CODE: "STD09"}, @@ -41,29 +55,53 @@ STRAIN_FIELDS = [ ] }, { - FIELD: "ABS related files", - VALIDATION: [], + FIELD: "absFile", + VALIDATION: [ + {TYPE: TITLE, ERROR_CODE: "STD59"}, + {TYPE: URL_TITLE, ERROR_CODE: "STD60", + MULTIPLE: True, SEPARATOR: ";"}, + ], + }, + + { + FIELD: "siteLinks", + VALIDATION: [ + {TYPE: DOMINIO, ERROR_CODE: "STD53", + MULTIPLE: False, SEPARATOR: ";"}, + {TYPE: URL_DOMINIO, ERROR_CODE: "STD56", + MULTIPLE: False, SEPARATOR: ";"}, + ], }, { - FIELD: "MTA file", - VALIDATION: [], + FIELD: "mtaFile", + VALIDATION: [ + {TYPE: JUST_URL, ERROR_CODE: "STD58", + MULTIPLE: True, SEPARATOR: ";"}, + ], }, { - FIELD: "Other culture collection numbers", - # VALIDATION: [ - # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07", - # MULTIPLE: True, SEPARATOR: ";"} - # ] + FIELD: "otherCollectionNumbers", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "([^ ]* [^ ]*)(; [^ ]* [^ ]*)*$", ERROR_CODE: "STD63", + MULTIPLE: True, SEPARATOR: ';'}, + #{TYPE: CROSSREF, CROSSREF_NAME: "Strains", ERROR_CODE: "STD64"}, + ] }, { - FIELD: "Strain from a registered collection", + FIELD: "registeredCollection", VALIDATION: [ {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD11"} ] + }, + { + FIELD: "type", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD64"}, + ] }, { - FIELD: "Risk Group", + FIELD: "riskGroup", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD12"}, {TYPE: MISSING, ERROR_CODE: "STD13"}, @@ -72,33 +110,41 @@ STRAIN_FIELDS = [ ] }, { - FIELD: "Dual use", + FIELD: "dualUse", VALIDATION: [ {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD15"} ] }, { - FIELD: "Quarantine in Europe", + FIELD: "euQuarantine", VALIDATION: [ {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD16"} ] }, + { + FIELD: "axenicCulture", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["Axenic", "Not axenic"], + ERROR_CODE: "STD50"} + ] + }, + { - FIELD: "Organism type", + FIELD: "organismType", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD17"}, {TYPE: MISSING, ERROR_CODE: "STD18"}, {TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria", - "Cyanobacteria", "Filamentous Fungi", - "Phage", "Plasmid", "Virus", "Yeast", - "1", "2", "3", "4", "5", "6", "7", "8", "9"], + "Cyanobacteria", "Filamentous Fungi", "Filamentous fungi", + "Yeast", "Microalgae", + "1", "2", "3", "4", "5", "6", "7"], MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"} ] }, { - FIELD: "Taxon name", + FIELD: "speciesName", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD20"}, {TYPE: MISSING, ERROR_CODE: "STD21"}, @@ -107,73 +153,69 @@ STRAIN_FIELDS = [ ] }, { - FIELD: "Infrasubspecific names", + FIELD: "infrasubspecificNames", + VALIDATION: [] }, { - FIELD: "Comment on taxonomy", + FIELD: "taxonomyComments", + VALIDATION: [] }, { - FIELD: "Interspecific hybrid", + FIELD: "hybrid", VALIDATION: [ {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD23"} ] }, { - FIELD: "Status", + FIELD: "status", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "^(type of|neotype of|holotype of |epitype of) ([a-zA-Z .'-]+)$", ERROR_CODE: "STD65"}, + + ] + }, { - FIELD: "History of deposit", + FIELD: "depositHistory", VALIDATION: [ - # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex - # MULTIPLE: True, SEPARATOR: ";"} + {TYPE: HISTORY, ERROR_CODE: 'STD24'}, ] }, { - FIELD: "Depositor" + FIELD: "depositor", + VALIDATION: [] }, { - FIELD: "Date of deposit", + FIELD: "depositDate", VALIDATION: [ {TYPE: DATE, ERROR_CODE: "STD25"}, ] }, { - FIELD: "Date of inclusion in the catalogue", + FIELD: "accessionDate", VALIDATION: [ {TYPE: DATE, ERROR_CODE: "STD26"}, ] }, { - FIELD: "Collected by", + FIELD: "collector", + VALIDATION: [] + }, + + + { + FIELD: "substrate", + VALIDATION: [] }, { - FIELD: "Date of collection", - VALIDATION: [ - {TYPE: DATE, ERROR_CODE: "STD27"}, - ] - }, - { - FIELD: "Isolated by", - }, - { - FIELD: "Date of isolation", - VALIDATION: [ - {TYPE: DATE, ERROR_CODE: "STD28"}, - ] - }, - { - FIELD: "Substrate/host of isolation", - }, - { - FIELD: "Tested temperature growth range", + FIELD: "temperatureGrowthRange", VALIDATION: [ {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"} ] }, { - FIELD: "Recommended growth temperature", + FIELD: "recommendedTemperature", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD30"}, {TYPE: MISSING, ERROR_CODE: "STD31"}, @@ -182,17 +224,9 @@ STRAIN_FIELDS = [ MULTIPLE: True, SEPARATOR: ";"} ] }, + { - FIELD: "Recommended medium for growth", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "STD33"}, - {TYPE: MISSING, ERROR_CODE: "STD34"}, - {TYPE: CROSSREF, CROSSREF_NAME: "Growth media", - MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"} - ] - }, - { - FIELD: "Form of supply", + FIELD: "supplyForms", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "STD36"}, {TYPE: MISSING, ERROR_CODE: "STD37"}, @@ -202,52 +236,70 @@ STRAIN_FIELDS = [ ] }, { - FIELD: "Other denomination", + FIELD: "otherDenomination", + VALIDATION: [] }, { - FIELD: "Coordinates of geographic origin", + FIELD: "geographicCoordinates", VALIDATION: [ {TYPE: COORDINATES, ERROR_CODE: "STD39"}, + ] }, - { - FIELD: "Altitude of geographic origin", - VALIDATION: [ - {TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"}, - ] - }, + { # value can be in the cell or in another sheet. Don't configure this - FIELD: "Geographic origin", + FIELD: "geographicOrigin", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD68"}, + {TYPE: MISSING, ERROR_CODE: "STD69"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Geographic origin", ERROR_CODE: "STD46"}, + ] + }, + + { + FIELD: "isolationHabitat", + VALIDATION: [] }, { - FIELD: "Isolation habitat", - }, - { - FIELD: "Ontobiotope term for the isolation habitat", + FIELD: "ontobiotopeTerms", VALIDATION: [ {TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope", MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"} ] + }, + { + FIELD: "qps", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD49"} + ] }, { - FIELD: "GMO", + FIELD: "gmo", VALIDATION: [ {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD42"} ] }, { - FIELD: "GMO construction information", + FIELD: "gmoConstruction", + VALIDATION: [] }, { - FIELD: "Mutant information", + FIELD: "mutant", + VALIDATION: [] }, { - FIELD: "Genotype", + FIELD: "genotype", + VALIDATION: [] }, { - FIELD: "Sexual state", + FIELD: "Plant pathogenicity code", + VALIDATION: [] + }, + { + FIELD: "sexualState", VALIDATION: [ {TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET, ERROR_CODE: "STD43"} @@ -258,46 +310,78 @@ STRAIN_FIELDS = [ ] }, { - FIELD: "Ploidy", + FIELD: "ploidy", VALIDATION: [ - {TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"], + {TYPE: CHOICES, VALUES: ["1", "2", "3", "4", "5", "9"], ERROR_CODE: "STD44"} ] }, { - FIELD: "Plasmids", + FIELD: "plasmids", + VALIDATION: [] }, { - FIELD: "Plasmids collections fields", + FIELD: "plasmidCollections", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(\s*;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$", + ERROR_CODE: "STD62"} + ] }, { # value can be in the cell or in another sheet. Don't configure this - FIELD: "Literature", + FIELD: "identificationLiterature", VALIDATION: [ {TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET, MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"} ] }, { - FIELD: "Plant pathogenicity code", + FIELD: "pathogenicity", + VALIDATION: [] }, { - FIELD: "Pathogenicity", + FIELD: "enzymes", + VALIDATION: [] }, { - FIELD: "Enzyme production", + FIELD: "metabolites", + VALIDATION: [] }, { - FIELD: "Production of metabolites", + FIELD: "applications", + VALIDATION: [] }, { - FIELD: "Applications", + FIELD: "remarks", + VALIDATION: [] }, { - FIELD: "Remarks" + FIELD: "sequenceLiterature", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "^\d+(\s*;?\s*\d+)*$", ERROR_CODE: "STD61"}, + ] + }, - { - FIELD: "Literature linked to the sequence/genome", + + { + FIELD: "recommendedMedium", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD33"}, + {TYPE: MISSING, ERROR_CODE: "STD34"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Growth media", + MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"} + ] + }, + + + { + FIELD: "country", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD54"}, + {TYPE: MISSING, ERROR_CODE: "STD55"}, + {TYPE: ISO, ERROR_CODE: "STD57"}, + #{TYPE: CROSSREF, CROSSREF_NAME: COUNTRY_CODES_SHEET, ERROR_CODE: "STD57"} + ] }, ] SHEETS_SCHEMA = { @@ -317,7 +401,7 @@ SHEETS_SCHEMA = { FIELD: "Country", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "GOD03"}, - {TYPE: MISSING, ERROR_CODE: "GOD04"} + {TYPE: MISSING, ERROR_CODE: "GOD04"}, ] }, { @@ -389,6 +473,7 @@ SHEETS_SCHEMA = { VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "GID07"}, {TYPE: MISSING, ERROR_CODE: "GID08"}, + {TYPE: REGEXP, MATCH: "^[A-Z]{2}[0-9]{6}$", ERROR_CODE: "GID11"}, ] }, { @@ -399,11 +484,9 @@ SHEETS_SCHEMA = { }, STRAINS: { "acronym": "STD", - 'id_field': 'Accession number', + 'id_field': 'accessionNumber', VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"}, - ROW_VALIDATION: [ - {TYPE: NAGOYA, ERROR_CODE: "STD46"}, - ], + ROW_VALIDATION: [], COLUMNS: STRAIN_FIELDS, }, LITERATURE_SHEET: { @@ -412,7 +495,7 @@ SHEETS_SCHEMA = { VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"}, ROW_VALIDATION: [ {TYPE: BIBLIO, ERROR_CODE: 'LID17'} - ], + ], COLUMNS: [ { FIELD: "ID", @@ -421,6 +504,18 @@ SHEETS_SCHEMA = { {TYPE: MISSING, ERROR_CODE: "LID02"}, ] }, + { + FIELD: "PMID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID18"}, + ] + }, + { + FIELD: "DOI", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID20"}, + ] + }, { FIELD: "Full reference", VALIDATION: [ @@ -465,7 +560,6 @@ SHEETS_SCHEMA = { FIELD: "First page", VALIDATION: [ {TYPE: MANDATORY, ERROR_CODE: "LID15"}, - {TYPE: MISSING, ERROR_CODE: "LID16"}, ] }, { @@ -504,13 +598,38 @@ SHEETS_SCHEMA = { }, { FIELD: "Name", - VALIDATION: [ - {TYPE: MANDATORY, ERROR_CODE: "OTD03"}, - {TYPE: MISSING, ERROR_CODE: "OTD04"}, - ] + VALIDATION: [] }, ] }, + + + + + + CONTROL_SHEET: { + "acronym": "VRS", + "id_field": "Version", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS09"}, + COLUMNS: [ + { + FIELD: "Version", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "VRS01"}, + {TYPE: MISSING, ERROR_CODE: "VRS02"}, + {TYPE: VERSION, ERROR_CODE: "VRS05"}, + ] + }, + { + FIELD: "Date", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "VRS03"}, + {TYPE: MISSING, ERROR_CODE: "VRS04"}, + ] + }, + ] + }, + MARKERS: { "acronym": "MKD", "id_field": "Acronym", @@ -524,22 +643,31 @@ SHEETS_SCHEMA = { VALIDATION: [] }, ], - }, + }, } + CROSS_REF_CONF = { - ONTOBIOTOPE: ['ID', 'Name'], - LITERATURE_SHEET: ['ID'], - LOCATIONS: ['Locality'], + ONTOBIOTOPE: ['ID'], + LITERATURE_SHEET: ['ID', 'DOI', 'PMID', 'Full reference', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'], + LOCATIONS: ['ID', 'Locality'], GROWTH_MEDIA: ['Acronym'], - STRAINS: ["Accession number"], + STRAINS: ["accessionNumber"], SEXUAL_STATE_SHEET: [], MARKERS: ["Acronym"], + } -MIRRI_20200601_VALLIDATION_CONF = { +MIRRI_12052023_VALLIDATION_CONF = { 'sheet_schema': SHEETS_SCHEMA, 'cross_ref_conf': CROSS_REF_CONF, 'keep_sheets_in_memory': [ {'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}] } + +version_config = { + '5.1.2': MIRRI_12052023_VALLIDATION_CONF, + 'date': '12/05/2023' + +} +