Compare commits

..

No commits in common. "master" and "master" have entirely different histories.

11 changed files with 361 additions and 829 deletions

View File

@ -37,14 +37,14 @@ TRUEFALSE_TRANSLATOR = {
}
def parse_mirri_excel(fhand, version=""):
if version == "5.1.2":
return _parse_mirri_v12052023(fhand)
def parse_mirri_excel(fhand, version="20200601"):
if version == "20200601":
return _parse_mirri_v20200601(fhand)
else:
raise NotImplementedError("Only version is 5.1.2 implemented")
raise NotImplementedError("Only version 20200601 is implemented")
def _parse_mirri_v12052023(fhand):
def _parse_mirri_v20200601(fhand):
fhand.seek(0)
file_content = BytesIO(fhand.read())
wb = load_workbook(filename=file_content, read_only=True, data_only=True)
@ -64,6 +64,7 @@ def _parse_mirri_v12052023(fhand):
return {"strains": strains, "growth_media": growth_media}
def index_list_by(list_, id_):
return {str(item[id_]): item for item in list_}
@ -124,7 +125,7 @@ def parse_strains(wb, locations, growth_media, markers, publications,
publications = index_list_by_attr(publications, 'id')
markers = index_markers(markers)
for strain_row in workbook_sheet_reader(wb, STRAINS, "accessionNumber"):
for strain_row in workbook_sheet_reader(wb, STRAINS, "Accession number"):
strain = StrainMirri()
strain_id = None
label = None
@ -201,18 +202,8 @@ def parse_strains(wb, locations, growth_media, markers, publications,
items = value.split(";")
strain.collect.location.latitude = float(items[0])
strain.collect.location.longitude = float(items[1])
strain.collect.location.precision = float(items[2])
strain.collect.location.altitude = float(items[3])
if len(items) > 4:
strain.collect.location.coord_uncertainty = items[4]
elif attribute == "collect.site.links":
items = value.split(";")
strain.collect.site.links.nameSite = str(items[0])
strain.collect.site.links.urlSite = str(items[1])
rsetattr(strain, attribute, value.split(";")) #ver o separador
if len(items) > 2:
strain.collect.site.links.site_uncertainty = items[2]
strain.collect.location.coord_uncertainty = items[2]
elif attribute == "collect.location":
location = locations[value]

View File

@ -50,10 +50,11 @@ PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS]
def write_mirri_excel(path, strains, growth_media, version):
if version == "5.1.2":
_write_mirri_excel_12052023(path, strains, growth_media)
if version == "20200601":
_write_mirri_excel_20200601(path, strains, growth_media)
def _write_mirri_excel_12052023(path, strains, growth_media):
def _write_mirri_excel_20200601(path, strains, growth_media):
wb = Workbook()
write_markers_sheet(wb)
@ -103,7 +104,7 @@ def _write_mirri_excel_12052023(path, strains, growth_media):
redimension_cell_width(pub_sheet)
# write sexual states
sex_sheet = wb.create_sheet("Sexual state")
sex_sheet = wb.create_sheet("Sexual states")
for sex_state in sorted(list(sexual_states)):
sex_sheet.append([sex_state])
redimension_cell_width(sex_sheet)
@ -120,6 +121,7 @@ def _write_mirri_excel_12052023(path, strains, growth_media):
del wb["Sheet"]
wb.save(str(path))
def _deserialize_strains(strains, locations, growth_media_indexes,
publications, sexual_states, genomic_markers):
for strain in strains:
@ -187,19 +189,8 @@ def _deserialize_strains(strains, locations, growth_media_indexes,
elif attribute == "collect.location.coords":
lat = strain.collect.location.latitude
long = strain.collect.location.longitude
alt = strain.collect.location.altitude
prec = strain.collect.location.precision
if lat is not None and long is not None and prec is not None and alt is not None:
value = f"{lat};{long};{prec};{alt}"
else:
value = None
elif attribute == "collect.site.links":
name = strain.collect.site.links.nameSite
url = strain.collect.site.links.urlSite
value = rgetattr(strain, attribute)
value = ";".join(value)
if name is not None and url is not None:
value = f"{name};{url}"
if lat is not None and long is not None:
value = f"{lat};{long}"
else:
value = None

View File

@ -3,7 +3,6 @@ from pathlib import Path
DATA_DIR = Path(__file__).parent / "data"
ACCESSION_NUMBER = "accession_number"
MIRRI_ACCESSION_NUMBER = 'mirri_accession_number'
RESTRICTION_ON_USE = "restriction_on_use"
NAGOYA_PROTOCOL = "nagoya_protocol"
ABS_RELATED_FILES = "abs_related_files"
@ -15,7 +14,6 @@ DUAL_USE = "dual_use"
QUARANTINE = "quarantine"
ORGANISM_TYPE = "organism_type"
TAXON_NAME = "taxon_name"
TYPE = "type"
INFRASUBSPECIFIC_NAME = "infrasubspecific_names"
COMMENTS_ON_TAXONOMY = "comments_on_taxonomy"
STATUS = "status"
@ -56,9 +54,6 @@ SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation"
ISOLATION_HABITAT = "isolation_habitat"
ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat"
LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome"
AXENIC_CULTURE = "axenic_culture"
QPS ="qps"
SITE_LINK = "site_links"
# StrainId
STRAIN_ID = "id"
@ -104,80 +99,73 @@ ALLOWED_COLLECTING_SITE_KEYS = [
]
MIRRI_FIELDS = [
{"attribute": "id", "label": "accessionNumber"},
{"attribute": "mirri_accession_number", "label": "mirriAccessionNumber"},
{"attribute": "qps", "label": "qps"},
{"attribute": "axenic_culture", "label": "axenicCulture"},
{"attribute": "restriction_on_use", "label": "useRestrictions"},
{"attribute": "id", "label": "Accession number"},
{"attribute": "restriction_on_use", "label": "Restrictions on use"},
{"attribute": "nagoya_protocol",
"label": "nagoyaConditions"},
{"attribute": ABS_RELATED_FILES, "label": "absFile"},
{"attribute": "mta_files", "label": "mtaFile"},
{"attribute": "other_numbers", "label": "otherCollectionNumbers"},
"label": "Nagoya protocol restrictions and compliance conditions"},
{"attribute": ABS_RELATED_FILES, "label": "ABS related files"},
{"attribute": "mta_files", "label": "MTA file"},
{"attribute": "other_numbers", "label": "Other culture collection numbers"},
{"attribute": "is_from_registered_collection",
"label": "registeredCollection"},
{"attribute": "risk_group", "label": "riskGroup"},
{"attribute": "is_potentially_harmful", "label": "dualUse"},
{"attribute": "is_subject_to_quarantine", "label": "euQuarantine"},
{"attribute": "taxonomy.organism_type", "label": "organismType"},
{"attribute": "taxonomy.taxon_name", "label": "speciesName"},
"label": "Strain from a registered collection"},
{"attribute": "risk_group", "label": "Risk Group"},
{"attribute": "is_potentially_harmful", "label": "Dual use"},
{"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"},
{"attribute": "taxonomy.organism_type", "label": "Organism type"},
{"attribute": "taxonomy.taxon_name", "label": "Taxon name"},
{"attribute": "taxonomy.infrasubspecific_name",
"label": "infrasubspecificNames"},
{"attribute": "taxonomy.comments", "label": "taxonomyComments"},
"label": "Infrasubspecific names"},
{"attribute": "taxonomy.comments", "label": "Comment on taxonomy"},
{"attribute": "taxonomy.interspecific_hybrid",
"label": "hybrid"},
{"attribute": "status", "label": "status"},
{"attribute": "history", "label": "depositHistory", },
{"attribute": "deposit.who", "label": "depositor"},
{"attribute": "deposit.date", "label": "depositDate"},
"label": "Interspecific hybrid"},
{"attribute": "status", "label": "Status"},
{"attribute": "history", "label": "History of deposit", },
{"attribute": "deposit.who", "label": "Depositor"},
{"attribute": "deposit.date", "label": "Date of deposit"},
{"attribute": "catalog_inclusion_date",
"label": "accessionDate"},
{"attribute": "collect.who", "label": "collector"},
{"attribute": "collect.date", "label": "collectionDate"},
{"attribute": "isolation.who", "label": "isolator"},
{"attribute": "isolation.date", "label": "isolationDate"},
"label": "Date of inclusion in the catalogue"},
{"attribute": "collect.who", "label": "Collected by"},
{"attribute": "collect.date", "label": "Date of collection"},
{"attribute": "isolation.who", "label": "Isolated by"},
{"attribute": "isolation.date", "label": "Date of isolation"},
{"attribute": "isolation.substrate_host_of_isolation",
"label": "substrate"},
"label": "Substrate/host of isolation"},
{"attribute": "growth.tested_temp_range",
"label": "temperatureGrowthRange"},
"label": "Tested temperature growth range"},
{"attribute": "growth.recommended_temp",
"label": "recommendedTemperature"},
"label": "Recommended growth temperature"},
{"attribute": "growth.recommended_media",
"label": "recommendedMedium"},
{"attribute": "form_of_supply", "label": "supplyForms"},
{"attribute": "other_denominations", "label": "otherDenomination"},
"label": "Recommended medium for growth"},
{"attribute": "form_of_supply", "label": "Form of supply"},
{"attribute": "other_denominations", "label": "Other denomination"},
{"attribute": "collect.location.coords",
"label": "geographicCoordinates"},
{"attribute": "collect.site.links",
"label": "siteLinks"},
"label": "Coordinates of geographic origin"},
{"attribute": "collect.location.altitude",
"label": "country"},
{"attribute": "collect.location", "label": "geographicOrigin"},
{"attribute": "collect.habitat", "label": "isolationHabitat"},
"label": "Altitude of geographic origin"},
{"attribute": "collect.location", "label": "Geographic origin"},
{"attribute": "collect.habitat", "label": "Isolation habitat"},
{"attribute": "collect.habitat_ontobiotope",
"label": "ontobiotopeTerms"},
{"attribute": "genetics.gmo", "label": "gmo"},
"label": "Ontobiotope term for the isolation habitat"},
{"attribute": "genetics.gmo", "label": "GMO"},
{"attribute": "genetics.gmo_construction",
"label": "gmoConstruction"},
{"attribute": "genetics.mutant_info", "label": "mutant"},
{"attribute": "genetics.genotype", "label": "genotype"},
{"attribute": "genetics.sexual_state", "label": "sexualState"},
{"attribute": "genetics.ploidy", "label": "ploidy"},
{"attribute": "genetics.plasmids", "label": "plasmids"},
"label": "GMO construction information"},
{"attribute": "genetics.mutant_info", "label": "Mutant information"},
{"attribute": "genetics.genotype", "label": "Genotype"},
{"attribute": "genetics.sexual_state", "label": "Sexual state"},
{"attribute": "genetics.ploidy", "label": "Ploidy"},
{"attribute": "genetics.plasmids", "label": "Plasmids"},
{"attribute": "genetics.plasmids_in_collections",
"label": "plasmidCollections"},
{"attribute": "publications", "label": "identificationLiterature"},
"label": "Plasmids collections fields"},
{"attribute": "publications", "label": "Literature"},
{"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"},
{"attribute": "pathogenicity", "label": "pathogenicity"},
{"attribute": "enzyme_production", "label": "enzymes"},
{"attribute": "pathogenicity", "label": "Pathogenicity"},
{"attribute": "enzyme_production", "label": "Enzyme production"},
{"attribute": "production_of_metabolites",
"label": "metabolites"},
{"attribute": "type",
"label": "type"},
{"attribute": "applications", "label": "applications", },
{"attribute": "remarks", "label": "remarks"},
"label": "Production of metabolites"},
{"attribute": "applications", "label": "Applications", },
{"attribute": "remarks", "label": "Remarks"},
{"attribute": LITERATURE_LINKED_TO_SEQ_GENOME,
"label": "sequenceLiterature"},
"label": "Literature linked to the sequence/genome"},
]
ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma",
@ -240,9 +228,8 @@ ALLOWED_MARKER_TYPES = [
]
PUBLICATIONS = "publications"
PUB_ID = "pub_id"
PUB_ID = "id"
PUB_DOI = "pub_doi"
PUB_PMID = "pub_pmid"
PUB_PUBMED_ID = ''
PUB_FULL_REFERENCE = "full_reference"
PUB_TITLE = "title"
@ -260,8 +247,6 @@ BOOK_PUBLISHER = "book_publisher"
PUBLICATION_FIELDS = [
{"label": "ID", "attribute": PUB_ID},
{"label": "PMID", "attribute": PUB_PMID},
{"label": "DOI", "attribute": PUB_DOI},
{"label": "Full reference", "attribute": PUB_FULL_REFERENCE},
{"label": "Authors", "attribute": PUB_AUTHORS},
{"label": "Title", "attribute": PUB_TITLE},
@ -297,43 +282,15 @@ SUBTAXAS = {
"f.sp.": "forma.specialis"
}
#Control
VERSION = "Version"
DATE = "Date"
#Country codes
COUNTRY = "Country"
CODE = "Code"
ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE = "Additional information on the country or code"
#Country codes files
COUNTRY_CODES_SHEET = [
{"label": "Country", "attribute": COUNTRY},
{"label": "Code", "attribute": CODE},
{"label": "Additional information on the country or code", "attribute": ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE},
]
# Control files
CONTROL_FIELDS = [
{"label": "Version", "attribute": VERSION},
{"label": "Date", "attribute": DATE},
]
# Excel sheet name
LOCATIONS = "Geographic origin" # 'Locations'
GROWTH_MEDIA = "Growth media"
GENOMIC_INFO = "Genomic information"
STRAINS = "Strains"
LITERATURE_SHEET = "Literature"
SEXUAL_STATE_SHEET = "Sexual state"
SEXUAL_STATE_SHEET = "Sexual states"
RESOURCE_TYPES_VALUES = "Resource types values"
FORM_OF_SUPPLY_SHEET = "Forms of supply"
PLOIDY_SHEET = "Ploidy"
ONTOBIOTOPE = "Ontobiotope"
MARKERS = "Markers"
CONTROL_SHEET = "Version"
COUNTRY_CODES_SHEET = "Country codes"
RESOURCE_SHEET = 'Resource types values'

View File

@ -0,0 +1,50 @@
from mirri import rgetattr
def validate_strain(strain, version='20200601'):
    """Validate *strain* using the rules of the requested spec *version*.

    Args:
        strain: the strain object handed to the version-specific validator.
        version: specification version identifier; only '20200601' is handled.

    Returns:
        Whatever ``_validate_strain_v20200601`` returns for *strain*.

    Raises:
        NotImplementedError: if *version* is anything other than '20200601'.
    """
    # Guard clause: reject unsupported versions before doing any work.
    if version != '20200601':
        raise NotImplementedError('Only v20200601 is implemented')
    return _validate_strain_v20200601(strain)
def _validate_strain_v20200601(strain):
    """Collect validation error messages for *strain* (spec 20200601).

    Every mandatory attribute path is looked up with ``rgetattr``; a value of
    ``None`` produces one "<label> is mandatory field" message, in table
    order. Afterwards ``is_valid_nagoya`` is consulted and, when it returns a
    falsy value, one further message is appended.

    Returns:
        list of str: the collected messages (empty if nothing was flagged).
    """
    # (label used in the error message, attribute path passed to rgetattr)
    required = (
        ('Accession Number', 'id.strain_id'),
        ('Nagoya protocol', 'nagoya_protocol'),
        ('Restriction on use', 'restriction_on_use'),
        ('Risk group', 'risk_group'),
        ('Organism type', 'taxonomy.organism_type'),
        ('Taxon name', 'taxonomy.long_name'),
        ('Recommended temperature to growth', 'growth.recommended_temp'),
        ('Recommended media', 'growth.recommended_media'),
        ('Form of supply', 'form_of_supply'),
        ('Country', 'collect.location.country'),
    )

    problems = [
        f"{label} is mandatory field"
        for label, path in required
        if rgetattr(strain, path) is None
    ]

    # The Nagoya check is run once, after the mandatory-field pass.
    if not is_valid_nagoya(strain):
        problems.append('Not compliant wih nagoya protocol requirements')

    return problems
def is_valid_nagoya(strain):
    """Return False when a strain dated 2014 or later has no country set.

    The reference date is the first non-None value among, in this order:
    ``collect.date``, ``isolation.date``, ``deposit.date`` and
    ``catalog_inclusion_date``. Later candidates are only read when the
    earlier ones are None. If no date is found, the date's ``_year`` is None,
    or the year is not >= 2014, the strain is considered valid and
    ``collect.location.country`` is not inspected.

    Returns:
        bool: True if the check passes, False otherwise.
    """
    # Lazy accessors so each fallback is only touched when actually needed.
    candidates = (
        lambda: strain.collect.date,
        lambda: strain.isolation.date,
        lambda: strain.deposit.date,
        lambda: strain.catalog_inclusion_date,
    )

    reference = None
    for fetch in candidates:
        reference = fetch()
        if reference is not None:
            break

    if reference is None:
        return True

    year = reference._year
    if year is None or not year >= 2014:
        return True

    # From 2014 onwards a country of origin must be present.
    return strain.collect.location.country is not None

View File

@ -62,10 +62,6 @@ class Entity():
def GID(self) -> str:
return 'Genomic Information'
def VRS(self) -> str:
return 'Version'
def OTD(self) -> str:
return 'Ontobiotope'

View File

@ -93,9 +93,6 @@ class ErrorMessage():
def EFS08(self):
return "The 'Genomic information' sheet is missing. Please check the provided excel template."
def EFS09(self):
return "The 'Version' sheet is missing. Please check the provided excel template."
"""
Growth Media Error Codes
"""
@ -150,26 +147,26 @@ class ErrorMessage():
def LID03(self):
return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID04(self):
#return f"The 'Full reference' for literature with ID {self.pk} is missing."
def LID04(self):
return f"The 'Full reference' for literature with ID {self.pk} is missing."
def LID05(self):
return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID06(self):
#return f"The 'Authors' for literature with ID {self.pk} is missing."
def LID06(self):
return f"The 'Authors' for literature with ID {self.pk} is missing."
def LID07(self):
return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID08(self):
#return f"The 'Title' for literature with ID {self.pk} is missing."
def LID08(self):
return f"The 'Title' for literature with ID {self.pk} is missing."
def LID09(self):
return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID10(self):
#return f"The 'Journal' for literature with ID {self.pk} is missing."
def LID10(self):
return f"The 'Journal' for literature with ID {self.pk} is missing."
def LID11(self):
return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty."
@ -190,191 +187,167 @@ class ErrorMessage():
return f"The 'First page' for literature with ID {self.pk} is missing."
def LID17(self):
return( f"There are four types of ways to fill in the 'Literature' sheet.",
"1st- Columns 'ID' and 'DOI' must be obrigatory.",
"2nd-Columns 'ID' and 'PMID' are obrigatory.",
"3rd-Columns 'ID' and 'Full reference' are obrigatory.",
"In the alternative of these three types of forms not being filled in, we have:",
"4th-Columns 'ID', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'.")
def LID18(self):
return "The 'PMID' column is a mandatory field. The column can not be empty."
#def LID19(self):
#return f"PMID for literature with ID {self.pk} is missing."
def LID20(self):
return "The 'DOI' column is a mandatory field. The column can not be empty."
#def LID21(self):
#return f"DOI for literature with ID {self.pk} is missing."
msg = 'If journal; Title, Authors, journal, year and first page are required'
msg += 'If Book; Book Title, Authors, Year, Editors, Publishers'
return msg
"""
Strains Error Codes
"""
def STD01(self):
return "The 'accessionNumber' column is a mandatory field in the Strains sheet."
return "The 'Accession number' column is a mandatory field in the Strains sheet."
def STD02(self):
return "The 'accessionNumber' column is empty or has missing values."
return "The 'Accession number' column is empty or has missing values."
def STD03(self):
return f"The 'accessionNumber' must be unique. The '{self.value}' is repeated."
return f"The 'Accesion number' must be unique. The '{self.value}' is repeated."
def STD04(self):
return (f"The 'accessionNumber' {self.pk} is not according to the specification."
return (f"The 'Accession number' {self.pk} is not according to the specification."
" The value must be of the format '<Sequence of characters> <sequence of characters>'.")
def STD05(self):
return f"The 'useRestrictions' column is a mandatory field in the Strains Sheet. The column can not be empty."
return f"The 'Restriction on use' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD06(self):
return f"The 'useRestrictions' for strain with accessionNumber {self.pk} is missing."
return f"The 'Restriction on use' for strain with Accession Number {self.pk} is missing."
def STD07(self):
return (f"The 'useRestrictions' for strain with accessionNumber {self.pk} is not according to the specification."
return (f"The 'Restriction on use' for strain with Accession Number {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD08(self):
return f"The 'nagoyaConditions' column is a mandatory field in the Strains Sheet. The column can not be empty."
return f"The 'Nagoya protocol restrictions and compliance conditions' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD09(self):
return f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is missing."
return f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is missing."
def STD10(self):
return (f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is not according to the specification."
return (f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD11(self):
return (f"The 'registeredCollection' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Strain from a registered collection' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD12(self):
return "The 'riskGroup' column is a mandatory field in the Strains Sheet. The column can not be empty."
return "The 'Risk group' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD13(self):
return f"The 'riskGroup' for strain with accessionNumber {self.pk} is missing."
return f"The 'Risk group' for strain with Accession Number {self.pk} is missing."
def STD14(self):
return (f"The 'riskGroup' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Risk group' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.")
def STD15(self):
return (f"The 'dualUse' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Dual use' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD16(self):
return (f"The “euQuarantine” for strain with accessionNumber {self.pk} is not according to specification."
return (f"The “Quarantine in europe” for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD17(self):
return f"The 'organismType' column is a mandatory field in the Strains Sheet. The column can not be empty."
return f"The 'Organism type' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD18(self):
return f"The 'organismType' for strain with accessionNumber {self.pk} is missing."
return f"The 'Organism type' for strain with Accession Number {self.pk} is missing."
def STD19(self):
return (f"The 'organismType' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Organism type' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', "
"'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.")
def STD20(self):
return f"The 'speciesName' column is a mandatory field in the Strains Sheet. The column can not be empty."
return f"The 'Taxon name' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD21(self):
return f"The 'speciesName' for strain with accessionNumber {self.pk} is missing."
return f"The 'Taxon name' for strain with Accession Number {self.pk} is missing."
def STD22(self):
return f"The 'speciesName' for strain with accessionNumber {self.pk} is incorrect."
return f"The 'Taxon name' for strain with Accession Number {self.pk} is incorrect."
def STD23(self):
return (f"The 'hybrid' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Interspecific hybrid' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD24(self):
return (f"The 'depositHistory' for strain with accessionNumber {self.pk} is incorrect."
"The field includes entries separated by '<' meaning 'received from'."
"Entries may include persons or CCs. The name of the CC should be followed by"
"the month, when available, and year of the acquisition. Between parentheses,"
"the strain designation or CC numbers and/or a name can also be entered when "
"a name change has occurred.")
return f"The 'History of deposit' for strain with Accession Number {self.pk} is incorrect."
def STD25(self):
return (f"The 'depositDate' for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Date of deposit' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD26(self):
return (f"The 'accessionDate' for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Date of inclusion in the catalogue' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD27(self):
return (f"The 'collectionDate' for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Date of collection' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD28(self):
return (f"The 'isolationDate' for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Date of isolation' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD29(self):
return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Tested temperature growth range' for strain with Accession Number {self.pk} is incorrect."
" It must have two decimal numbers separated by ','")
def STD30(self):
return f"The 'temperatureGrowthRange' column is a mandatory field in the Strains Sheet. The column can not be empty."
return f"The 'Recommended growth temperature' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD31(self):
return f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is missing."
return f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is missing."
def STD32(self):
return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is incorrect."
" It must have two decimal numbers separated by ','.")
def STD33(self):
return ("The 'recommendedTemperature' column is a mandatory field in the Strains Sheet. The column can not be empty.")
return f"The 'Recommended medium for growth' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD34(self):
return f"The 'recommendedTemperature' for strain with accessionNumber {self.pk} is missing."
return f"The 'Recommended medium for growth' for strain with Accession Number {self.pk} is missing."
def STD35(self):
return f"The value of 'recommendedTemperature' for strain with accessionNumber {self.pk} is not in the Growth Media Sheet."
return f"The value of 'Recommended medium for growth' for strain with Accession Number {self.pk} is not in the Growth Media Sheet."
def STD36(self):
return f"The 'supplyForms' column is a mandatory field in the Strains Sheet. The column can not be empty."
return f"The 'Forms of supply' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD37(self):
return f"The 'supplyForms' for strain with accessionNumber {self.pk} is missing."
return f"The 'Forms of supply' for strain with Accession Number {self.pk} is missing."
def STD38(self):
return f"The value of 'supplyForms' for strain with accessionNumber {self.pk} is not in the Forms of Supply Sheet."
return f"The value of 'Forms of supply' for strain with Accession Number {self.pk} is not in the Forms of Supply Sheet."
def STD39(self):
return (f"The 'geographicCoordinates' column for strain with accessionNumber {self.pk} is incorrect."
"The allowed formats are two, three or four decimal numbers separated by ','. Moreover, the first number must be."
"between [-90, 90], the second between [-180, 180], and the third and fourth refers to the precision and altitude, defined by decimal numbers."
"Put a question mark for lack of precision or altitude when one of them is missing. Leave the values blank when both are missing. ")
return (f"The 'Coordinates of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
"The allowed formats are two or three decimal numbers separated by ','. Moreover, the first number must be"
"between [-90, 90], the second between [-180, 180], and the third, if provided, can assume any value.")
def STD40(self):
return (f"The 'country' column for strain with accessionNumber {self.pk} is incorrect."
return (f"The 'Altitude of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
"The allowed formats are one decimal number between [-200, 8000].")
def STD54(self):
return (f"The 'country'column is a mandatory field in the Strains Sheet. The column can not be empty.")
def STD55(self):
return (f"The 'country' for strain with accessionNumber {self.pk} is missing.")
def STD41(self):
return f"The value of 'ontobiotopeTerms' for strain with accessionNumber {self.pk} is not in the Ontobiotope Sheet."
return f"The value of 'Ontobiotope term for the isolation habitat' for strain with Accession Number {self.pk} is not in the Ontobiotope Sheet."
def STD42(self):
return (f"The 'gmo' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'GMO' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2")
def STD43(self):
return (f"The 'sexualState' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Sexual State' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', "
"'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'")
def STD44(self):
return (f"The 'ploidy' for strain with accessionNumber {self.pk} is not according to specification."
return (f"The 'Ploidy' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9")
def STD45(self):
@ -383,97 +356,24 @@ class ErrorMessage():
return msg
def STD46(self):
return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is not according to specification."
f"The 'geographicOrigin' column must consist of the ID's associated with the Geographic origin sheet.")
msg = f"If date of collection/isolation/deposit/inclusion in the catalog is after 2014," \
f" the value of column Geographic Origin must be provided and associated with a country in the " \
f"Geographic Origin sheet. The value is missing or not associated with a country for strain {self.pk}."
return msg
def STD47(self):
return "The 'country' column is a mandatory field in the Strains sheet."
def STD48(self):
return "The 'country' column is empty or has missing values."
def STD49(self):
return (f"The “qps” for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD50(self):
return (f"The “axenicCulture” for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Axenic', 'Not axenic'.")
def STD51(self):
return f"The 'mirriAccessionNumber' must be unique. The '{self.pk}' is repeated."
def STD52(self):
return (f"The 'mirriAccessionNumber' for strain with accessionNumber {self.pk} is incorrect."
" It must have the expression MIRRI followed by 7 digits")
def STD53(self):
return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect."
" The displayed expression it should be composed of: site name ';' website url." )
def STD56(self):
return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect."
" The url must be valid. " )
def STD57(self):
return (f"The 'country' for strain with accessionNumber {self.pk} is incorrect."
"This information must be expressed by using the ISO-3166 standard for country"
"codes. The preferred set is ISO 3166-1 alpha-2 (two letters code), but ISO 3166-"
"1 alpha-3 (three letters code) is also accepted. Former country codes must"
"follow standards part three ISO 3166-3 (four letters code). Only one code can"
"be included." )
def STD58(self):
# Message for an 'mtaFile' cell whose URL failed validation.
# Interpolates self.pk (the strain accession number).
return (f"The 'mtaFile' for strain with accessionNumber {self.pk} is incorrect."
" The url must be valid. " )
def STD59(self):
    """Return the message for an 'absFile' cell that is not written as "name;website url".

    Interpolates ``self.pk`` (the strain accession number).
    """
    # Each continuation fragment carries its own leading space; without it the
    # concatenated sentences run together ("incorrect.The", "URLwill").
    return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect."
            " The displayed expression it should be composed of: name ';' website url."
            " When only one URL is provided, the title may be omitted. In this case, the URL"
            " will be shown in clear to users.")
def STD60(self):
# Message for an 'absFile' cell whose URL failed validation.
# Interpolates self.pk (the strain accession number).
return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect."
" The url must be valid. ")
def STD61(self):
    """Return the message for a 'sequenceLiterature' cell that is not a ';'-separated list of numbers.

    Interpolates ``self.pk`` (the strain accession number).
    """
    # The second fragment starts with a space so the two sentences do not run together.
    return (f"The 'sequenceLiterature' for strain with accessionNumber {self.pk} is incorrect."
            " Numeric identifiers separated by a semicolon ';'.")
def STD62(self):
    """Return the message for a malformed 'plasmidCollections' cell.

    Interpolates ``self.pk`` (the strain accession number).
    """
    # Fragments are joined verbatim, so each one supplies the space separating it
    # from its predecessor; the stray quote after "space" has been dropped.
    return (f"The 'plasmidCollections' for strain with accessionNumber {self.pk} is incorrect."
            " It should include the name of the plasmid followed by the CC number in"
            " parentheses. More than one plasmid can be reported, separated by ';'. "
            "Plasmid names should be provided as free text."
            " CC numbers should be composed by the CC acronym followed by a number"
            " separated by a space. Numeric identifiers separated by a semicolon ';'.")
def STD63(self):
# Message for an 'otherCollectionNumbers' cell that does not follow the
# "<characters> <characters>" layout. Interpolates self.pk (the strain accession number).
return (f"The 'otherCollectionNumbers' for strain with accessionNumber {self.pk} is incorrect."
" The value must be of the format '<Sequence of characters> <sequence of characters>'.")
def STD64(self):
    """Return the message for a strain whose 'type' value is not an accepted choice.

    Interpolates ``self.pk`` (the strain accession number) and ``self.value``
    (the offending cell value).
    """
    # The second fragment starts with a space so the two sentences do not run together.
    return (f"The 'type' for strain with accessionNumber {self.pk} is incorrect."
            f" Your value is {self.value} and the accepted values are 1, 2.")
def STD65(self):
    """Return the message for a strain whose 'status' text has the wrong structure.

    Interpolates ``self.pk`` (the strain accession number).
    """
    # Leading space separates the sentences; the quoted example is now closed.
    return (f"The 'status' for strain with accessionNumber {self.pk} is incorrect."
            " The structure should be 'type of <character string>'.")
def STD68(self):
    """Return the message stating that the Strains sheet lacks the 'geographicOrigin' column."""
    # Plain literal: the text has no placeholders, so no f-string is needed.
    return ("The 'geographicOrigin' column is a mandatory field in the Strains Sheet."
            " The column can not be empty.")
def STD69(self):
# Message for a strain row with an empty 'geographicOrigin' cell.
# Interpolates self.pk (the strain accession number).
return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is missing.")
"""
Genomic Information Error Codes
"""
def GID01(self):
return f"The 'Strain accessionNumber' (Strain AN) column is a mandatory field in the Genomic Information Sheet."
return f"The 'Strain Acession Number' (Strain AN) column is a mandatory field in the Genomic Information Sheet."
def GID02(self):
return f"The 'Strain accessionNumber' (Strain AN) column is empty or has missing values."
return f"The 'Strain Acession Number' (Strain AN) column is empty or has missing values."
def GID03(self):
return f"The value of 'Strain accessionNumber' (Strain AN) {self.value} is not in the Strains sheet."
return f"The value of 'Strain Acession Number' (Strain AN) {self.value} is not in the Strains sheet."
def GID04(self):
return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
@ -497,35 +397,6 @@ class ErrorMessage():
return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect."
" It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces.")
def GID11(self):
    """Return the message for a genomic-information entry whose INSDC accession is malformed.

    Interpolates ``self.pk`` (the Strain AN of the genomic-information row).
    """
    # Fragments are joined verbatim, so each continuation starts with its own space.
    return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect."
            " An INSDC accession number is an alphanumeric"
            " code made by a fixed number of letters followed by a fixed number of digits,"
            " without any separation. For sequences, the code is currently made of two"
            " letters followed by six numbers.")
"""
Version Error Codes
"""
def VRS01(self):
# Message stating that the 'Version' column is absent from the Version sheet.
return "The 'Version' columns is a mandatory field in the Version Sheet."
def VRS02(self):
# Message stating that the 'Version' column has empty cells.
return "The 'Version' columns is empty or has missing values."
def VRS03(self):
# Message stating that the 'Date' column is absent.
# NOTE(review): this text says "Control Sheet" while VRS01 says "Version Sheet" for the
# same sheet's other column - confirm which sheet name users actually see.
return "The 'Date' columns is a mandatory field in the Control Sheet."
def VRS04(self):
# Message stating that the 'Date' column has empty cells.
return "The 'Date' columns is empty or has missing values."
def VRS05(self):
# Message for the version check; interpolates self.value.
# NOTE(review): if self.value is the rejected cell value, the sentence names the wrong
# version as "the only one to be used" - confirm what self.value holds for this code.
return f"The version {self.value} is the only one to be used."
"""
Ontobiotope Error Codes
"""
@ -536,12 +407,8 @@ class ErrorMessage():
def OTD02(self):
return "The 'ID' columns is empty or has missing values."
#def OTD03(self):
def OTD03(self):
return "The 'Name' columns is a mandatory field in the Ontobiotope Sheet. The column can not be empty."
#def OTD04(self):
def OTD04(self):
return f"The 'Name' for ontobiotope with ID {self.pk} is missing."

View File

@ -4,51 +4,27 @@ from io import BytesIO
from zipfile import BadZipfile
from datetime import datetime
from calendar import monthrange
import requests
from openpyxl import load_workbook
import pycountry
from mirri.io.parsers.excel import workbook_sheet_reader, get_all_cell_data_from_sheet
from mirri.validation.error_logging import ErrorLog, Error
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON,
TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE,
HISTORY,NAGOYA1, VERSION)
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON,
TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO)
from mirri.settings import LOCATIONS, SUBTAXAS
from mirri.validation.validation_conf_12052023 import version_config
from mirri.validation.validation_conf_12052023 import MIRRI_12052023_VALLIDATION_CONF
from mirri.validation.validation_conf_20200601 import MIRRI_20200601_VALLIDATION_CONF
def validate_mirri_excel(fhand, version= "5.1.2" ):
if version == "5.1.2":
configuration = MIRRI_12052023_VALLIDATION_CONF
def validate_mirri_excel(fhand, version="20200601"):
if version == "20200601":
configuration = MIRRI_20200601_VALLIDATION_CONF
else:
raise NotImplementedError("Only version 5.1.2 is implemented")
raise NotImplementedError("Only version20200601 is implemented")
return validate_excel(fhand, configuration)
def version(value, validation_conf=None):
    """Check that a cell names a specification version known to ``version_config``.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted because emptiness
            is reported by a separate rule.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True when ``value`` is None or is a key of ``version_config`` that maps to
        a validation configuration, False otherwise.
    """
    if value is None:
        return True
    try:
        # version_config also stores plain metadata (its 'date' entry maps to a
        # string), so only keys that map to a configuration dict count as versions.
        return isinstance(version_config.get(value), dict)
    except TypeError:
        # An unhashable cell value can never equal a version key.
        return False
def validate_country_code(value, validation_conf=None):
    """Check that a cell holds an ISO 3166 country code known to pycountry.

    Accepted forms are ISO 3166-1 alpha-2, ISO 3166-1 alpha-3 and the four-letter
    ISO 3166-3 code of a former country.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted because emptiness
            is reported by a separate rule.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True when ``value`` is None or matches one of the code sets, else False.
    """
    if value is None:
        return True
    lookups = (
        (pycountry.countries, "alpha_2"),
        (pycountry.countries, "alpha_3"),
        (pycountry.historic_countries, "alpha_4"),
    )
    for database, field in lookups:
        try:
            if database.get(**{field: value}):
                return True
        except (LookupError, TypeError, AttributeError):
            # A failed lookup in one code set must not stop the remaining sets from
            # being tried (some pycountry releases raise instead of returning None).
            continue
    return False
def validate_excel(fhand, configuration):
validation_conf = configuration['sheet_schema']
@ -209,14 +185,11 @@ def validate_row(row, validation_steps, in_memory_sheets):
kind = validation_step[TYPE]
error_code = validation_step[ERROR_CODE]
if kind == NAGOYA:
if not is_valid_nagoya_v12052023(row, in_memory_sheets):
if not is_valid_nagoya(row, in_memory_sheets):
return error_code
elif kind == BIBLIO:
if not is_valid_pub(row):
return error_code
elif kind == NAGOYA1:
if not is_valid_nago(row):
return error_code
else:
msg = f'{kind} is not a recognized row validation type method'
raise NotImplementedError(msg)
@ -235,69 +208,48 @@ def validate_cell(value, validation_steps, crossrefs, shown_values, label):
if error_code is not None:
return error_code
def is_valid_pub(row):
pub_id = row.get('ID', None)
pub_pmid = row.get('PMID', None)
pub_doi = row.get('DOI', None)
title = row.get('Title', None)
full_reference = row.get('Full reference', None)
authors = row.get('Authors', None)
journal = row.get('Journal', None)
year = row.get('Year', None)
volumen = row.get('Volume', None)
volumen = row.get('Volumen', None)
first_page = row.get('First page', None)
book_title = row.get('Book title', None)
editors = row.get('Editors', None)
publishers = row.get('Publishers', None)
if (pub_id != None and pub_doi != None) or (pub_id != None and pub_pmid != None) or (pub_id != None and full_reference != None) or (pub_id != None and authors != None and title != None and journal != None and year != None and volumen != None and first_page != None) :
if full_reference:
return True
is_journal = bool(title)
# if (is_journal and (not authors or not journal or not not year or
# not volumen or not first_page)):
# return False
#if (not is_journal and (not authors or not year or
# not editors or not publishers or not book_title)):
# return False
return False
def is_valid_nago(row):
if not row:
return True
status = row.get("status", None)
type = row.get("type", None)
regex = r'^[a-zA-Z\s.\'-]+$'
if status != None and type != None:
if (re.match(regex, status) and type==1):
return False
if (type == 2 and status is None):
if (is_journal and (not authors or not journal or not not year or
not volumen or not first_page)):
return False
if (not is_journal and (not authors or not year or
not editors or not publishers or not book_title)):
return False
return True
def parsee_mirri_excel(row, in_memory_sheets, version=""):
if version == "12052023":
return is_valid_nagoya_v12052023 (row, in_memory_sheets)
else:
raise NotImplementedError("Only version is implemented")
def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity
location_index = row.get('geographicOrigin', None)
def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity
location_index = row.get('Geographic origin', None)
if location_index is None:
country = None
else:
geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {})
country = geo_origin.get('Country', None)
_date = row.get("collectionDate", None)
_date = row.get("Date of collection", None)
if _date is None:
_date = row.get("isolationDate", None)
_date = row.get("Date of isolation", None)
if _date is None:
_date = row.get("depositDate", None)
_date = row.get("Date of deposit", None)
if _date is None:
_date = row.get("accessionDate", None)
_date = row.get("Date of inclusion in the catalogue", None)
if _date is not None:
year = _date.year if isinstance(_date, datetime) else int(str(_date)[:4])
else:
@ -306,9 +258,9 @@ def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-i
if year is not None and year >= 2014 and country is None:
return False
return True
def is_valid_regex(value, validation_conf):
if value is None:
return True
@ -358,9 +310,7 @@ def is_valid_choices(value, validation_conf):
values = [v.strip() for v in str(value).split(separator)]
else:
values = [str(value).strip()]
sorted_values = sorted(values)
if sorted_values != values:
return False
return all(value in choices for value in values)
@ -402,136 +352,20 @@ def is_valid_date(value, validation_conf):
return True
def is_valid_dominio(value, validation_conf=None):
    """Check that each site name in a "name;url[;name;url...]" cell matches its URL's domain.

    The domain is the second-to-last label of the URL's host name
    (``www.dsmz.de`` -> ``dsmz``) and is compared case-insensitively with the name.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True for None, for a single item without ';' (nothing to compare), and when
        every name matches its URL's domain. False for non-text values, unpaired
        items, URLs without a usable host, or any mismatch.
    """
    from urllib.parse import urlsplit

    if value is None:
        return True
    if not isinstance(value, str):
        return False
    items = [item.strip() for item in value.split(";")]
    if len(items) == 1:
        return True
    if len(items) % 2:
        # A trailing name without its URL can never be verified.
        return False
    for name, url in zip(items[::2], items[1::2]):
        # Parse the host rather than dot-splitting the whole URL, so paths that
        # contain dots and hosts without a "www." prefix are handled correctly.
        target = url if "//" in url else f"//{url}"
        try:
            host = urlsplit(target).hostname
        except ValueError:
            return False
        labels = host.split(".") if host else []
        if len(labels) < 2 or name.lower() != labels[-2]:
            return False
    return True
def is_valid_title(value, validation_conf=None):
    """Check that every title in a "title;url[;title;url...]" cell is real text.

    A title is rejected when it is empty or is itself an http(s) URL.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True for None, for a single item without ';' (a bare URL, title omitted),
        and when every title is acceptable. False for non-text values, unpaired
        items, or any empty / URL-shaped title.
    """
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    items = [item.strip() for item in value.split(";")]
    if len(items) == 1:
        return True
    if len(items) % 2:
        # A trailing title without its URL makes the cell malformed.
        return False
    # Same language as the former pattern, but without the nested quantifier
    # "(...*)*", which could backtrack exponentially on long non-matching titles.
    url_like = re.compile(r'^(http|https):\/\/[a-z0-9\-\.]+\.[a-z]{2,}[/a-z0-9\-\.]*$')
    return not any(title == '' or url_like.match(title) for title in items[::2])
def is_valid_url_title(value, validation_conf=None):
    """Check that every URL in a "url" or "title;url[;title;url...]" cell answers HTTP 200.

    Each URL is probed with an HTTP HEAD request.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True for None or when every URL responds with status 200. False for
        non-text values, unpaired items, request failures, or any other status.
    """
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    items = [item.strip() for item in value.split(";")]
    if len(items) == 1:
        # A lone item is a bare URL whose title was omitted.
        urls = items
    elif len(items) % 2 == 0:
        urls = items[1::2]
    else:
        # A title without its URL makes the cell malformed.
        return False
    try:
        # The timeout keeps one unresponsive host from stalling the whole validation.
        return all(requests.head(url, timeout=10).status_code == 200 for url in urls)
    except (requests.RequestException, ValueError):
        # Malformed URLs and network failures both mean "not a valid link".
        return False
def is_valid_url_dominio(value, validation_conf=None):
    """Check that every URL in a "name;url[;name;url...]" cell answers HTTP 200.

    Each URL is probed with an HTTP HEAD request.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True for None or when every URL responds with status 200. False for
        non-text values, items that do not form name/URL pairs, request failures,
        or any other status.
    """
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    items = [item.strip() for item in value.split(";")]
    if len(items) % 2:
        # Every name needs its URL; a lone item or a trailing name is malformed.
        return False
    try:
        # The timeout keeps one unresponsive host from stalling the whole validation.
        return all(requests.head(url, timeout=10).status_code == 200
                   for url in items[1::2])
    except (requests.RequestException, ValueError):
        # Malformed URLs and network failures both mean "not a valid link".
        return False
def is_valid_just_url(value, validation_conf=None):
    """Check that every ';'-separated URL in a cell answers HTTP 200.

    Each URL is probed with an HTTP HEAD request.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True for None or when every URL responds with status 200. False for
        non-text values, request failures, or any other status.
    """
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    urls = [item.strip() for item in value.split(";")]
    try:
        # The timeout keeps one unresponsive host from stalling the whole validation.
        return all(requests.head(url, timeout=10).status_code == 200 for url in urls)
    except (requests.RequestException, ValueError):
        # Malformed URLs and network failures both mean "not a valid link".
        return False
def is_valid_history(value, validation_conf=None):
    """Check every '<'-separated step of a deposit-history cell against the accepted layouts.

    A step is accepted when its beginning matches at least one of the five layouts
    below (depositor text, optionally combined with a 19xx/20xx year and a
    parenthesised note).

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Unused; present so all validators share one signature.

    Returns:
        True for None or when every step matches a layout; False for non-text
        values or when any step matches none.
    """
    if value is None:
        return True
    if not isinstance(value, str):
        return False
    # Double-quoted raw strings: the former single-quoted literals contained '',
    # which ended the literal early, silently dropped the apostrophe from every
    # character class and left one pattern with an invalid range. The hyphen is
    # kept last in each class so it is always literal.
    layouts = (
        r"^[a-zA-Z0-9 &,;.:'-]+,?\s*((19|20)\d{2})",
        r"^[a-zA-Z0-9 &,;.:'-]+,?\s*[a-zA-Z0-9 &,;.'-] (19|20)\d{2}\s\([a-zA-Z0-9 &,;.':-]+\)",
        r"^[a-zA-Z0-9 &,;.:'-]+,?\s*[a-zA-Z0-9 &,;.'-]",
        r"^[a-zA-Z0-9 &,;.'-]+,?\s*(19|20)\d{2}\s\([a-zA-Z0-9 .',;&:-]+\)",
        r"^[a-zA-Z0-9 &,;.:'-]+,?\s*\([a-zA-Z0-9 &,;.':-]+\) (19|20)\d{2}",
    )
    steps = [step.strip() for step in value.split("<")]
    # Every step must be well-formed, not just the first one.
    return all(any(re.match(layout, step) for layout in layouts) for step in steps)
def is_valid_coords(value, validation_conf=None):
# sourcery skip: return-identity
if value is None:
return True
try:
regex1 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?)(\s*;\s*-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?))*$'
regex2 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?)\s*;\s*-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?)\s*;\s*(\d+\.\d+|\?)\s*;\s*(\d+\.\d+|\?)$|^(\d+\.\d+|\?)$|^\s*;\s*$'
if not re.match(regex1, value) and not re.match(regex2, value):
items = [i.strip() for i in value.split(";")]
latitude = float(items[0])
longitude = float(items[1])
if len(items) > 2:
precision = float(items[2])
if latitude < -90 or latitude > 90:
return False
if longitude < -180 or longitude > 180:
return False
return True
except:
return False
@ -541,6 +375,24 @@ def is_valid_missing(value, validation_conf=None):
return value is not None
def is_valid_number(value, validation_conf):
    """Check that a cell holds a finite number within the configured bounds.

    Args:
        value: Cell content; ``None`` (empty cell) is accepted.
        validation_conf: Mapping that may carry inclusive ``'max'`` and ``'min'``
            limits; a missing or None limit is not enforced.

    Returns:
        True for None or a finite number inside the limits; False when the value
        cannot be converted to float, is NaN/infinite, or falls outside a limit.
    """
    import math

    if value is None:
        return True
    try:
        number = float(value)
    except (TypeError, ValueError):
        return False
    # NaN compares False against every bound and would otherwise slip through.
    if not math.isfinite(number):
        return False
    upper = validation_conf.get('max', None)
    lower = validation_conf.get('min', None)
    if upper is not None and number > upper:
        return False
    if lower is not None and number < lower:
        return False
    return True
def is_valid_taxon(value, validation_conf=None):
multiple = validation_conf.get(MULTIPLE, False)
separator = validation_conf.get(SEPARATOR, ';')
@ -577,8 +429,6 @@ def _is_valid_taxon(value):
def is_valid_unique(value, validation_conf):
if not value:
return True
label = validation_conf['label']
shown_values = validation_conf['shown_values']
if label not in shown_values:
@ -594,6 +444,7 @@ def is_valid_unique(value, validation_conf):
return True
def is_valid_file(path):
try:
with path.open("rb") as fhand:
@ -613,15 +464,8 @@ VALIDATION_FUNCTIONS = {
CROSSREF: is_valid_crossrefs,
DATE: is_valid_date,
COORDINATES: is_valid_coords,
NUMBER: is_valid_number,
TAXON: is_valid_taxon,
TITLE: is_valid_title,
DOMINIO: is_valid_dominio,
URL_TITLE: is_valid_url_title,
URL_DOMINIO: is_valid_url_dominio,
JUST_URL: is_valid_just_url,
ISO: validate_country_code,
HISTORY: is_valid_history,
VERSION: version,
UNIQUE: is_valid_unique}

View File

@ -16,20 +16,9 @@ MATCH = 'match'
VALUES = 'values'
DATE = 'date'
COORDINATES = 'coord'
COORDINATES1 = 'coord1'
NUMBER = 'number'
TAXON = 'taxon'
UNIQUE = 'unique'
ROW_VALIDATION = 'row_validation'
NAGOYA = 'nagoya'
BIBLIO = 'bibliography'
DOMINIO= 'is_valid_dominio'
TITLE= 'is_valid_title'
URL_DOMINIO = 'urll_valid_dominio'
URL_TITLE= 'is_valid_url_title'
ISO = 'validate_country_code'
JUST_URL= 'is_valid_just_url'
HISTORY= 'is_valid_history'
MEU='is_valid_crossrefs_meu'
NAGOYA1 = 'nayoga1'
VERSION = 'version'

View File

@ -1,25 +0,0 @@
#!/usr/bin/env python
import pandas as pd
import sys
from pathlib import Path
import warnings
warnings.simplefilter("ignore")
from mirri.validation.excel_validator import validate_mirri_excel
def main():
    """Validate the MIRRI Excel file named on the command line and print each error.

    Expects two arguments: the path of the Excel file and the specification
    version to validate against. One line is printed per error with its primary
    key, message and code.
    """
    if len(sys.argv) < 3:
        print(f"Usage: {sys.argv[0]} <excel_file> <version>", file=sys.stderr)
        sys.exit(2)
    path = Path(sys.argv[1])
    version = str(sys.argv[2])
    try:
        # The context manager closes the file handle even when validation raises.
        with path.open("rb") as fhand:
            error_log = validate_mirri_excel(fhand, version=version)
    except NotImplementedError as e:
        print(e)
        # No error log exists for an unsupported version; stop here rather than
        # falling through to the reporting loop with an undefined name.
        return
    for errors in error_log.get_errors().values():
        for error in errors:
            print(error.pk, error.message, error.code)
if __name__ == "__main__":
main()

View File

@ -1,13 +1,10 @@
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,VERSION,
VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1)
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,
VALIDATION, VALUES, BIBLIO)
from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET)
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS)
# GEOGRAPHIC_ORIGIN
# SEXUAL_STATE_SHEET,
# RESOURCE_TYPES_VALUES,
@ -15,12 +12,9 @@ from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
# PLOIDY_SHEET)
STRAIN_FIELDS = [
{
FIELD: "accessionNumber",
FIELD: "Accession number",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: 'STD01'},
{TYPE: UNIQUE, ERROR_CODE: 'STD03'},
@ -29,24 +23,16 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "useRestrictions",
FIELD: "Restrictions on use",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD05"},
{TYPE: MISSING, ERROR_CODE: "STD06"},
{TYPE: MISSING, ERROR_CODE: "STD06"},
{TYPE: CHOICES, VALUES: ["1", "2", "3"],
MULTIPLE: False, ERROR_CODE: "STD07"}
]
},
{
FIELD: "mirriAccessionNumber",
VALIDATION: [
{TYPE: UNIQUE, ERROR_CODE: 'STD51'},
{TYPE: REGEXP, MATCH: "^MIRRI[0-9]{7}$", ERROR_CODE: "STD52"},
],
},
{
FIELD: "nagoyaConditions",
FIELD: "Nagoya protocol restrictions and compliance conditions",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD08"},
{TYPE: MISSING, ERROR_CODE: "STD09"},
@ -55,53 +41,29 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "absFile",
VALIDATION: [
{TYPE: TITLE, ERROR_CODE: "STD59"},
{TYPE: URL_TITLE, ERROR_CODE: "STD60",
MULTIPLE: True, SEPARATOR: ";"},
],
},
{
FIELD: "siteLinks",
VALIDATION: [
{TYPE: DOMINIO, ERROR_CODE: "STD53",
MULTIPLE: False, SEPARATOR: ";"},
{TYPE: URL_DOMINIO, ERROR_CODE: "STD56",
MULTIPLE: False, SEPARATOR: ";"},
],
FIELD: "ABS related files",
VALIDATION: [],
},
{
FIELD: "mtaFile",
VALIDATION: [
{TYPE: JUST_URL, ERROR_CODE: "STD58",
MULTIPLE: True, SEPARATOR: ";"},
],
FIELD: "MTA file",
VALIDATION: [],
},
{
FIELD: "otherCollectionNumbers",
VALIDATION: [
{TYPE: REGEXP, MATCH: "([^ ]* [^ ]*)(; [^ ]* [^ ]*)*$", ERROR_CODE: "STD63",
MULTIPLE: True, SEPARATOR: ';'},
#{TYPE: CROSSREF, CROSSREF_NAME: "Strains", ERROR_CODE: "STD64"},
]
FIELD: "Other culture collection numbers",
# VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07",
# MULTIPLE: True, SEPARATOR: ";"}
# ]
},
{
FIELD: "registeredCollection",
FIELD: "Strain from a registered collection",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD11"}
]
},
{
FIELD: "type",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD64"},
]
},
{
FIELD: "riskGroup",
FIELD: "Risk Group",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD12"},
{TYPE: MISSING, ERROR_CODE: "STD13"},
@ -110,41 +72,33 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "dualUse",
FIELD: "Dual use",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD15"}
]
},
{
FIELD: "euQuarantine",
FIELD: "Quarantine in Europe",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD16"}
]
},
{
FIELD: "axenicCulture",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["Axenic", "Not axenic"],
ERROR_CODE: "STD50"}
]
},
{
FIELD: "organismType",
FIELD: "Organism type",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD17"},
{TYPE: MISSING, ERROR_CODE: "STD18"},
{TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria",
"Cyanobacteria", "Filamentous Fungi", "Filamentous fungi",
"Yeast", "Microalgae",
"1", "2", "3", "4", "5", "6", "7"],
"Cyanobacteria", "Filamentous Fungi",
"Phage", "Plasmid", "Virus", "Yeast",
"1", "2", "3", "4", "5", "6", "7", "8", "9"],
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"}
]
},
{
FIELD: "speciesName",
FIELD: "Taxon name",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD20"},
{TYPE: MISSING, ERROR_CODE: "STD21"},
@ -153,69 +107,73 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "infrasubspecificNames",
VALIDATION: []
FIELD: "Infrasubspecific names",
},
{
FIELD: "taxonomyComments",
VALIDATION: []
FIELD: "Comment on taxonomy",
},
{
FIELD: "hybrid",
FIELD: "Interspecific hybrid",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD23"}
]
},
{
FIELD: "status",
VALIDATION: [
{TYPE: REGEXP, MATCH: "^(type of|neotype of|holotype of |epitype of) ([a-zA-Z .'-]+)$", ERROR_CODE: "STD65"},
]
FIELD: "Status",
},
{
FIELD: "depositHistory",
FIELD: "History of deposit",
VALIDATION: [
{TYPE: HISTORY, ERROR_CODE: 'STD24'},
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex
# MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "depositor",
VALIDATION: []
FIELD: "Depositor"
},
{
FIELD: "depositDate",
FIELD: "Date of deposit",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD25"},
]
},
{
FIELD: "accessionDate",
FIELD: "Date of inclusion in the catalogue",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD26"},
]
},
{
FIELD: "collector",
VALIDATION: []
},
{
FIELD: "substrate",
VALIDATION: []
FIELD: "Collected by",
},
{
FIELD: "temperatureGrowthRange",
FIELD: "Date of collection",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD27"},
]
},
{
FIELD: "Isolated by",
},
{
FIELD: "Date of isolation",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD28"},
]
},
{
FIELD: "Substrate/host of isolation",
},
{
FIELD: "Tested temperature growth range",
VALIDATION: [
{TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "recommendedTemperature",
FIELD: "Recommended growth temperature",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD30"},
{TYPE: MISSING, ERROR_CODE: "STD31"},
@ -224,9 +182,17 @@ STRAIN_FIELDS = [
MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "supplyForms",
FIELD: "Recommended medium for growth",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD33"},
{TYPE: MISSING, ERROR_CODE: "STD34"},
{TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
]
},
{
FIELD: "Form of supply",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD36"},
{TYPE: MISSING, ERROR_CODE: "STD37"},
@ -236,70 +202,52 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "otherDenomination",
VALIDATION: []
FIELD: "Other denomination",
},
{
FIELD: "geographicCoordinates",
FIELD: "Coordinates of geographic origin",
VALIDATION: [
{TYPE: COORDINATES, ERROR_CODE: "STD39"},
]
},
{
FIELD: "Altitude of geographic origin",
VALIDATION: [
{TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"},
]
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "geographicOrigin",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD68"},
{TYPE: MISSING, ERROR_CODE: "STD69"},
{TYPE: CROSSREF, CROSSREF_NAME: "Geographic origin", ERROR_CODE: "STD46"},
]
},
{
FIELD: "isolationHabitat",
VALIDATION: []
FIELD: "Geographic origin",
},
{
FIELD: "ontobiotopeTerms",
FIELD: "Isolation habitat",
},
{
FIELD: "Ontobiotope term for the isolation habitat",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope",
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"}
]
},
{
FIELD: "qps",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD49"}
]
},
{
FIELD: "gmo",
FIELD: "GMO",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD42"}
]
},
{
FIELD: "gmoConstruction",
VALIDATION: []
FIELD: "GMO construction information",
},
{
FIELD: "mutant",
VALIDATION: []
FIELD: "Mutant information",
},
{
FIELD: "genotype",
VALIDATION: []
FIELD: "Genotype",
},
{
FIELD: "Plant pathogenicity code",
VALIDATION: []
},
{
FIELD: "sexualState",
FIELD: "Sexual state",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET,
ERROR_CODE: "STD43"}
@ -310,78 +258,46 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "ploidy",
FIELD: "Ploidy",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2", "3", "4", "5", "9"],
{TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"],
ERROR_CODE: "STD44"}
]
},
{
FIELD: "plasmids",
VALIDATION: []
FIELD: "Plasmids",
},
{
FIELD: "plasmidCollections",
VALIDATION: [
{TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(\s*;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$",
ERROR_CODE: "STD62"}
]
FIELD: "Plasmids collections fields",
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "identificationLiterature",
FIELD: "Literature",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET,
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"}
]
},
{
FIELD: "pathogenicity",
VALIDATION: []
FIELD: "Plant pathogenicity code",
},
{
FIELD: "enzymes",
VALIDATION: []
FIELD: "Pathogenicity",
},
{
FIELD: "metabolites",
VALIDATION: []
FIELD: "Enzyme production",
},
{
FIELD: "applications",
VALIDATION: []
FIELD: "Production of metabolites",
},
{
FIELD: "remarks",
VALIDATION: []
FIELD: "Applications",
},
{
FIELD: "sequenceLiterature",
VALIDATION: [
{TYPE: REGEXP, MATCH: "^\d+(\s*;?\s*\d+)*$", ERROR_CODE: "STD61"},
]
FIELD: "Remarks"
},
{
FIELD: "recommendedMedium",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD33"},
{TYPE: MISSING, ERROR_CODE: "STD34"},
{TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
]
},
{
FIELD: "country",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD54"},
{TYPE: MISSING, ERROR_CODE: "STD55"},
{TYPE: ISO, ERROR_CODE: "STD57"},
#{TYPE: CROSSREF, CROSSREF_NAME: COUNTRY_CODES_SHEET, ERROR_CODE: "STD57"}
]
{
FIELD: "Literature linked to the sequence/genome",
},
]
SHEETS_SCHEMA = {
@ -401,7 +317,7 @@ SHEETS_SCHEMA = {
FIELD: "Country",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD03"},
{TYPE: MISSING, ERROR_CODE: "GOD04"},
{TYPE: MISSING, ERROR_CODE: "GOD04"}
]
},
{
@ -473,7 +389,6 @@ SHEETS_SCHEMA = {
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID07"},
{TYPE: MISSING, ERROR_CODE: "GID08"},
{TYPE: REGEXP, MATCH: "^[A-Z]{2}[0-9]{6}$", ERROR_CODE: "GID11"},
]
},
{
@ -484,9 +399,11 @@ SHEETS_SCHEMA = {
},
STRAINS: {
"acronym": "STD",
'id_field': 'accessionNumber',
'id_field': 'Accession number',
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
ROW_VALIDATION: [],
ROW_VALIDATION: [
{TYPE: NAGOYA, ERROR_CODE: "STD46"},
],
COLUMNS: STRAIN_FIELDS,
},
LITERATURE_SHEET: {
@ -504,18 +421,6 @@ SHEETS_SCHEMA = {
{TYPE: MISSING, ERROR_CODE: "LID02"},
]
},
{
FIELD: "PMID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID18"},
]
},
{
FIELD: "DOI",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID20"},
]
},
{
FIELD: "Full reference",
VALIDATION: [
@ -560,6 +465,7 @@ SHEETS_SCHEMA = {
FIELD: "First page",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID15"},
{TYPE: MISSING, ERROR_CODE: "LID16"},
]
},
{
@ -598,38 +504,13 @@ SHEETS_SCHEMA = {
},
{
FIELD: "Name",
VALIDATION: []
},
]
},
CONTROL_SHEET: {
"acronym": "VRS",
"id_field": "Version",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS09"},
COLUMNS: [
{
FIELD: "Version",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "VRS01"},
{TYPE: MISSING, ERROR_CODE: "VRS02"},
{TYPE: VERSION, ERROR_CODE: "VRS05"},
]
},
{
FIELD: "Date",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "VRS03"},
{TYPE: MISSING, ERROR_CODE: "VRS04"},
{TYPE: MANDATORY, ERROR_CODE: "OTD03"},
{TYPE: MISSING, ERROR_CODE: "OTD04"},
]
},
]
},
MARKERS: {
"acronym": "MKD",
"id_field": "Acronym",
@ -646,28 +527,19 @@ SHEETS_SCHEMA = {
},
}
CROSS_REF_CONF = {
ONTOBIOTOPE: ['ID'],
LITERATURE_SHEET: ['ID', 'DOI', 'PMID', 'Full reference', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'],
LOCATIONS: ['ID', 'Locality'],
ONTOBIOTOPE: ['ID', 'Name'],
LITERATURE_SHEET: ['ID'],
LOCATIONS: ['Locality'],
GROWTH_MEDIA: ['Acronym'],
STRAINS: ["accessionNumber"],
STRAINS: ["Accession number"],
SEXUAL_STATE_SHEET: [],
MARKERS: ["Acronym"],
}
MIRRI_12052023_VALLIDATION_CONF = {
MIRRI_20200601_VALLIDATION_CONF = {
'sheet_schema': SHEETS_SCHEMA,
'cross_ref_conf': CROSS_REF_CONF,
'keep_sheets_in_memory': [
{'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}]
}
version_config = {
'5.1.2': MIRRI_12052023_VALLIDATION_CONF,
'date': '12/05/2023'
}