This commit is contained in:
denisaborgesccg 2023-07-18 09:16:38 +01:00
parent 37b2bbce98
commit 2370686d72
24 changed files with 18 additions and 1008 deletions

Binary file not shown.

Binary file not shown.

View File

@ -19,12 +19,6 @@ from mirri.settings import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO,
NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION, NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION,
ONLY_RESEARCH, ONTOBIOTOPE, ONLY_RESEARCH, ONTOBIOTOPE,
PUBLICATION_FIELDS, STRAINS, SUBTAXAS) PUBLICATION_FIELDS, STRAINS, SUBTAXAS)
from mirri.settings_v1 import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO,
GROWTH_MEDIA, LITERATURE_SHEET, LOCATIONS,
MIRRI_FIELDS, NAGOYA_DOCS_AVAILABLE, NAGOYA_NO_RESTRICTIONS,
NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION,
ONLY_RESEARCH, ONTOBIOTOPE,
PUBLICATION_FIELDS, STRAINS, SUBTAXAS)
from mirri.utils import get_country_from_name from mirri.utils import get_country_from_name
RESTRICTION_USE_TRANSLATOR = { RESTRICTION_USE_TRANSLATOR = {
@ -44,34 +38,12 @@ TRUEFALSE_TRANSLATOR = {
def parse_mirri_excel(fhand, version=""): def parse_mirri_excel(fhand, version=""):
if version == "20200602": if version == "5.1.2":
return _parse_mirri_v20200601(fhand)
elif version == "12052023":
return _parse_mirri_v12052023(fhand) return _parse_mirri_v12052023(fhand)
else: else:
raise NotImplementedError("Only versions 20200601 and 12052023 are implemented") raise NotImplementedError("Only version is 5.1.2 implemented")
def _parse_mirri_v20200601(fhand):
    """Parse a MIRRI workbook that follows the 20200601 layout.

    The handle is rewound and read completely into memory, then opened with
    openpyxl in read-only, data-only mode. The auxiliary sheets are read
    first because their contents are handed to ``parse_strains``.

    Returns:
        A dict with the keys ``"strains"`` and ``"growth_media"``.
    """
    fhand.seek(0)
    workbook = load_workbook(filename=BytesIO(fhand.read()), read_only=True,
                             data_only=True)

    # Auxiliary sheets; each one is passed on to parse_strains below.
    sheet_locations = workbook_sheet_reader(workbook, LOCATIONS)
    sheet_ontobiotopes = workbook_sheet_reader(workbook, ONTOBIOTOPE)
    media = list(parse_growth_media(workbook))
    sheet_markers = workbook_sheet_reader(workbook, GENOMIC_INFO)
    pubs = list(parse_publications(workbook))

    parsed_strains = parse_strains(
        workbook,
        locations=sheet_locations,
        growth_media=media,
        markers=sheet_markers,
        publications=pubs,
        ontobiotopes=sheet_ontobiotopes,
    )
    return {"strains": parsed_strains, "growth_media": media}
def _parse_mirri_v12052023(fhand): def _parse_mirri_v12052023(fhand):
fhand.seek(0) fhand.seek(0)
file_content = BytesIO(fhand.read()) file_content = BytesIO(fhand.read())

View File

@ -5,7 +5,6 @@ from openpyxl.workbook.workbook import Workbook
from mirri import rgetattr from mirri import rgetattr
from mirri.settings import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS from mirri.settings import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS
from mirri.settings_v1 import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS
from mirri.io.parsers.mirri_excel import NAGOYA_TRANSLATOR, RESTRICTION_USE_TRANSLATOR from mirri.io.parsers.mirri_excel import NAGOYA_TRANSLATOR, RESTRICTION_USE_TRANSLATOR
INITIAL_SEXUAL_STATES = [ INITIAL_SEXUAL_STATES = [
@ -51,81 +50,9 @@ PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS]
def write_mirri_excel(path, strains, growth_media, version): def write_mirri_excel(path, strains, growth_media, version):
if version == "20200601": if version == "5.1.2":
_write_mirri_excel_20200601(path, strains, growth_media)
if version == "12052023":
_write_mirri_excel_12052023(path, strains, growth_media) _write_mirri_excel_12052023(path, strains, growth_media)
def _write_mirri_excel_20200601(path, strains, growth_media):
    """Write ``strains`` and ``growth_media`` to ``path`` as a 20200601 workbook.

    Sheets are created in this order: the markers, ontobiotope and growth
    media sheets through their helper writers, then "Strains",
    "Geographic origin", "Literature", "Sexual state" and
    "Genomic information". The worksheet named "Sheet" is deleted before the
    workbook is saved.
    """
    workbook = Workbook()
    write_markers_sheet(workbook)
    write_ontobiotopes(workbook, DATA_DIR / "ontobiotopes.csv")
    write_growth_media(workbook, growth_media)

    # Containers handed to _deserialize_strains; they are read back below,
    # so presumably that helper fills them while the rows are materialised.
    media_acronyms = [str(medium.acronym) for medium in growth_media]
    locations = {}
    publications = {}
    sexual_states = set(deepcopy(INITIAL_SEXUAL_STATES))
    genomic_markers = {}
    strain_rows = list(
        _deserialize_strains(strains, locations, media_acronyms,
                             publications, sexual_states, genomic_markers)
    )

    # Strains sheet: header row taken from the MIRRI_FIELDS labels.
    strains_ws = workbook.create_sheet("Strains")
    strains_ws.append([field["label"] for field in MIRRI_FIELDS])
    for strain_row in strain_rows:
        strains_ws.append(strain_row)
    redimension_cell_width(strains_ws)

    # Locations sheet: the running position is the ID, the dict key is the
    # locality column.
    locations_ws = workbook.create_sheet("Geographic origin")
    locations_ws.append(["ID", "Country", "Region", "City", "Locality"])
    for position, (locality, location) in enumerate(locations.items()):
        locations_ws.append([position, location.country, location.state,
                             location.municipality, locality])
    redimension_cell_width(locations_ws)

    # Literature sheet: one column per PUBLICATION_FIELDS entry; attributes
    # missing on a publication are written as None.
    literature_ws = workbook.create_sheet("Literature")
    literature_ws.append(PUB_HEADERS)
    for publication in publications.values():
        literature_ws.append([getattr(publication, spec['attribute'], None)
                              for spec in PUBLICATION_FIELDS])
    redimension_cell_width(literature_ws)

    # Sexual state sheet: one sorted value per row, no header.
    sexual_state_ws = workbook.create_sheet("Sexual state")
    for state in sorted(sexual_states):
        sexual_state_ws.append([state])
    redimension_cell_width(sexual_state_ws)

    # Genomic information sheet: one row per (strain, marker) pair.
    genomic_ws = workbook.create_sheet("Genomic information")
    genomic_ws.append(['Strain AN', 'Marker', 'INSDC AN', 'Sequence'])
    for strain_id, strain_markers in genomic_markers.items():
        for marker in strain_markers:
            genomic_ws.append([strain_id, marker.marker_type,
                               marker.marker_id, marker.marker_seq])
    redimension_cell_width(genomic_ws)

    del workbook["Sheet"]
    workbook.save(str(path))
def _write_mirri_excel_12052023(path, strains, growth_media): def _write_mirri_excel_12052023(path, strains, growth_media):
wb = Workbook() wb = Workbook()

View File

@ -1,311 +0,0 @@
from pathlib import Path
from charset_normalizer import VERSION
# "data" directory located next to this module.
DATA_DIR = Path(__file__).parent / "data"
# String identifiers for strain-level fields.
# NOTE(review): presumably used as keys/attribute names elsewhere in the
# package; their consumers are not visible from this module.
ACCESSION_NUMBER = "accession_number"
RESTRICTION_ON_USE = "restriction_on_use"
NAGOYA_PROTOCOL = "nagoya_protocol"
ABS_RELATED_FILES = "abs_related_files"
MTA_FILES = "mta_file"
OTHER_CULTURE_NUMBERS = "other_culture_collection_numbers"
STRAIN_FROM_REGISTERED_COLLECTION = "strain_from_a_registered_collection"
RISK_GROUP = "risk_group"
DUAL_USE = "dual_use"
QUARANTINE = "quarantine"
ORGANISM_TYPE = "organism_type"
TAXON_NAME = "taxon_name"
INFRASUBSPECIFIC_NAME = "infrasubspecific_names"
COMMENTS_ON_TAXONOMY = "comments_on_taxonomy"
STATUS = "status"
HISTORY_OF_DEPOSIT = "history_of_deposit"
DEPOSITOR = "depositor"
DATE_OF_DEPOSIT = "date_of_deposit"
COLLECTED_BY = "collected_by"
DATE_OF_COLLECTION = "date_of_collection"
ISOLATED_BY = "isolated_by"
DATE_OF_ISOLATION = "date_of_isolation"
DATE_OF_INCLUSION = "date_of_inclusion_on_catalog"
TESTED_TEMPERATURE_GROWTH_RANGE = "tested_temperature_growth_range"
RECOMMENDED_GROWTH_TEMP = "recommended_growth_temperature"
RECOMMENDED_GROWTH_MEDIUM = "recommended_media_for_growth"
FORM_OF_SUPPLY = "form_of_supply"
GEO_COORDS = "coordinates_of_geographic_origin"
ACCESSION_NAME = "other_denomination"
# NOTE: ALTITUDE is assigned again ("altitude") in the Location section
# further down this module; after import the later value is the one in effect.
ALTITUDE = "altitude_of_geographic_origin"
GEOGRAPHIC_ORIGIN = "geographic_origin"
GMO = "gmo"
GMO_CONSTRUCTION_INFO = "gmo_construction_information"
MUTANT_INFORMATION = "mutant_information"
GENOTYPE = "genotype"
LITERATURE = "literature"
SEXUAL_STATE = "sexual_state"
PLOIDY = "ploidy"
INTERSPECIFIC_HYBRID = "interspecific_hybrid"
HYBRIDS = 'hybrids'
PLANT_PATHOGENICITY_CODE = "plant_pathogenicity_code"
PATHOGENICITY = "pathogenicity"
ENZYME_PRODUCTION = "enzyme_production"
PRODUCTION_OF_METABOLITES = "production_of_metabolites"
APPLICATIONS = "applications"
REMARKS = "remarks"
PLASMIDS = "plasmids"
PLASMIDS_COLLECTION_FIELDS = "plasmids_collections_fields"
SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation"
ISOLATION_HABITAT = "isolation_habitat"
ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat"
LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome"
# Strain identifier field names
STRAIN_ID = "id"
COLLECTION_CODE = "collection_code"
STRAIN_PUI = "strain_pui"
STRAIN_URL = "strain_url"
ID_SYNONYMS = 'id_synonyms'
# Taxonomy field names
GENUS = "genus"
SPECIES = "species"
# Location field names
COUNTRY = "countryOfOriginCode"
SITE = "site"
STATE = "state"
PROVINCE = "province"
MUNICIPALITY = "municipality"
ISLAND = "island"
OTHER = "other"
LATITUDE = "latitude"
LONGITUDE = "longitude"
# NOTE: this rebinds ALTITUDE, which was set to
# "altitude_of_geographic_origin" earlier in this module; "altitude" is the
# value used by ALLOWED_COLLECTING_SITE_KEYS below.
ALTITUDE = "altitude"
GEOREF_METHOD = "georeferencingMethod"
COORDUNCERTAINTY = "coordUncertainty"
# The spelling "coordenates" is part of the runtime value; do not "fix" it
# without checking every consumer of this key.
COORD_SPATIAL_REFERENCE = "coordenatesSpatialReference"
LOCATION = "location"
# Location keys gathered in one list.
# NOTE(review): the name suggests these are the only keys accepted for a
# collecting site; the check itself is not in this module.
ALLOWED_COLLECTING_SITE_KEYS = [
COUNTRY,
STATE,
PROVINCE,
ISLAND,
MUNICIPALITY,
OTHER,
SITE,
LATITUDE,
LONGITUDE,
ALTITUDE,
GEOREF_METHOD,
COORDUNCERTAINTY,
COORD_SPATIAL_REFERENCE,
]
# Ordered list of strain columns. Each entry pairs an "attribute" (a plain or
# dotted attribute path such as "taxonomy.taxon_name") with the "label" shown
# as the column header.
# NOTE(review): dotted paths are presumably resolved with an rgetattr-style
# helper on the strain object; confirm in the reader/writer modules.
MIRRI_FIELDS = [
{"attribute": "id", "label": "Accession number"},
{"attribute": "restriction_on_use", "label": "Restrictions on use"},
{"attribute": "nagoya_protocol",
"label": "Nagoya protocol restrictions and compliance conditions"},
{"attribute": ABS_RELATED_FILES, "label": "ABS related files"},
# NOTE(review): the MTA_FILES constant above is "mta_file" (singular) while
# this entry uses "mta_files"; confirm which spelling the strain object has.
{"attribute": "mta_files", "label": "MTA file"},
{"attribute": "other_numbers", "label": "Other culture collection numbers"},
{"attribute": "is_from_registered_collection",
"label": "Strain from a registered collection"},
{"attribute": "risk_group", "label": "Risk Group"},
{"attribute": "is_potentially_harmful", "label": "Dual use"},
{"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"},
{"attribute": "taxonomy.organism_type", "label": "Organism type"},
{"attribute": "taxonomy.taxon_name", "label": "Taxon name"},
{"attribute": "taxonomy.infrasubspecific_name",
"label": "Infrasubspecific names"},
{"attribute": "taxonomy.comments", "label": "Comment on taxonomy"},
{"attribute": "taxonomy.interspecific_hybrid",
"label": "Interspecific hybrid"},
{"attribute": "status", "label": "Status"},
{"attribute": "history", "label": "History of deposit", },
{"attribute": "deposit.who", "label": "Depositor"},
{"attribute": "deposit.date", "label": "Date of deposit"},
{"attribute": "catalog_inclusion_date",
"label": "Date of inclusion in the catalogue"},
{"attribute": "collect.who", "label": "Collected by"},
{"attribute": "collect.date", "label": "Date of collection"},
{"attribute": "isolation.who", "label": "Isolated by"},
{"attribute": "isolation.date", "label": "Date of isolation"},
{"attribute": "isolation.substrate_host_of_isolation",
"label": "Substrate/host of isolation"},
{"attribute": "growth.tested_temp_range",
"label": "Tested temperature growth range"},
{"attribute": "growth.recommended_temp",
"label": "Recommended growth temperature"},
{"attribute": "growth.recommended_media",
"label": "Recommended medium for growth"},
{"attribute": "form_of_supply", "label": "Form of supply"},
{"attribute": "other_denominations", "label": "Other denomination"},
{"attribute": "collect.location.coords",
"label": "Coordinates of geographic origin"},
{"attribute": "collect.location.altitude",
"label": "Altitude of geographic origin"},
{"attribute": "collect.location", "label": "Geographic origin"},
{"attribute": "collect.habitat", "label": "Isolation habitat"},
{"attribute": "collect.habitat_ontobiotope",
"label": "Ontobiotope term for the isolation habitat"},
{"attribute": "genetics.gmo", "label": "GMO"},
{"attribute": "genetics.gmo_construction",
"label": "GMO construction information"},
{"attribute": "genetics.mutant_info", "label": "Mutant information"},
{"attribute": "genetics.genotype", "label": "Genotype"},
{"attribute": "genetics.sexual_state", "label": "Sexual state"},
{"attribute": "genetics.ploidy", "label": "Ploidy"},
{"attribute": "genetics.plasmids", "label": "Plasmids"},
{"attribute": "genetics.plasmids_in_collections",
"label": "Plasmids collections fields"},
{"attribute": "publications", "label": "Literature"},
{"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"},
{"attribute": "pathogenicity", "label": "Pathogenicity"},
{"attribute": "enzyme_production", "label": "Enzyme production"},
{"attribute": "production_of_metabolites",
"label": "Production of metabolites"},
{"attribute": "applications", "label": "Applications", },
{"attribute": "remarks", "label": "Remarks"},
{"attribute": LITERATURE_LINKED_TO_SEQ_GENOME,
"label": "Literature linked to the sequence/genome"},
]
# Sub-specific rank names, and the full rank list built on top of them.
ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma",
'forma.specialis']
ALLOWED_TAXONOMIC_RANKS = ["family", "genus", "species"] + ALLOWED_SUBTAXA
# Nagoya protocol option identifiers
NAGOYA_NO_RESTRICTIONS = "no_known_restrictions_under_the_Nagoya_protocol"
NAGOYA_DOCS_AVAILABLE = "documents_providing_proof_of_legal_access_and_terms_of_use_available_at_the_collection"
NAGOYA_PROBABLY_SCOPE = "strain_probably_in_scope,_please_contact_the_culture_collection"
ALLOWED_NAGOYA_OPTIONS = [NAGOYA_NO_RESTRICTIONS,
NAGOYA_DOCS_AVAILABLE, NAGOYA_PROBABLY_SCOPE]
# Restriction-on-use option identifiers
NO_RESTRICTION = "no_restriction"
ONLY_RESEARCH = "only_research"
COMMERCIAL_USE_WITH_AGREEMENT = "commercial_use_with_agreement"
ALLOWED_RESTRICTION_USE_OPTIONS = [
NO_RESTRICTION,
ONLY_RESEARCH,
COMMERCIAL_USE_WITH_AGREEMENT,
]
# Risk groups are kept as strings, not integers.
ALLOWED_RISK_GROUPS = ["1", "2", "3", "4"]
# Form-of-supply values and the list that gathers them.
AGAR = "Agar"
CRYO = "Cryo"
DRY_ICE = "Dry Ice"
LIQUID_CULTURE_MEDIUM = "Liquid Culture Medium"
LYO = "Lyo"
OIL = "Oil"
WATER = "Water"
ALLOWED_FORMS_OF_SUPPLY = [AGAR, CRYO, DRY_ICE,
LIQUID_CULTURE_MEDIUM, LYO, OIL, WATER]
# Section names; these match the first component of the dotted attribute
# paths used in MIRRI_FIELDS (e.g. "deposit.who", "genetics.gmo").
DEPOSIT = "deposit"
ISOLATION = "isolation"
COLLECT = "collect"
GROWTH = "growth"
GENETICS = "genetics"
TAXONOMY = "taxonomy"
# Markers
# NOTE: MARKERS is assigned again ("Markers", the Excel sheet name) at the
# end of this module; after import the later value is the one in effect.
MARKERS = "markers"
MARKER_TYPE = "marker_type"
MARKER_INSDC = "INSDC"
MARKER_SEQ = "marker_seq"
# Known marker types: short acronym plus descriptive marker name.
ALLOWED_MARKER_TYPES = [
{"acronym": "16S rRNA", "marker": "16S rRNA"},
{"acronym": "ACT", "marker": "Actin"},
{"acronym": "CaM", "marker": "Calmodulin"},
{"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"},
{"acronym": "ITS",
"marker": "nuclear ribosomal Internal Transcribed Spacer (ITS)"},
{"acronym": "LSU", "marker": "nuclear ribosomal Large SubUnit (LSU)"},
{"acronym": "RPB1", "marker": "Ribosomal RNA-coding genes RPB1"},
{"acronym": "RPB2", "marker": "Ribosomal RNA-coding genes RPB2"},
{"acronym": "TUBB", "marker": "β-Tubulin"},
]
# Publication attribute names
PUBLICATIONS = "publications"
PUB_ID = "id"
PUB_DOI = "pub_doi"
# NOTE(review): empty attribute name; this looks like an unfinished
# placeholder. Left unchanged because the intended value is not known.
PUB_PUBMED_ID = ''
PUB_FULL_REFERENCE = "full_reference"
PUB_TITLE = "title"
PUB_AUTHORS = "authors"
PUB_JOURNAL = "journal"
PUB_YEAR = "year"
PUB_VOLUME = "volume"
PUB_ISSUE = "issue"
PUB_FIRST_PAGE = "first_page"
PUB_LAST_PAGE = "last_page"
BOOK_TITLE = "book_title"
BOOK_EDITOR = "book_editor"
BOOK_PUBLISHER = "book_publisher"
# Ordered columns of the literature sheet: header label plus the publication
# attribute that provides the cell value.
PUBLICATION_FIELDS = [
    {"label": "ID", "attribute": PUB_ID},
    {"label": "Full reference", "attribute": PUB_FULL_REFERENCE},
    {"label": "Authors", "attribute": PUB_AUTHORS},
    {"label": "Title", "attribute": PUB_TITLE},
    {"label": "Journal", "attribute": PUB_JOURNAL},
    {"label": "Year", "attribute": PUB_YEAR},
    {"label": "Volume", "attribute": PUB_VOLUME},
    {"label": "Issue", "attribute": PUB_ISSUE},
    {"label": "First page", "attribute": PUB_FIRST_PAGE},
    # Was mapped to PUB_FIRST_PAGE, which made the "Last page" column repeat
    # the first page instead of carrying its own value.
    {"label": "Last page", "attribute": PUB_LAST_PAGE},
    {"label": "Book title", "attribute": BOOK_TITLE},
    {"label": "Editors", "attribute": BOOK_EDITOR},
    {"label": "Publisher", "attribute": BOOK_PUBLISHER},
]
# Ploidy codes (integers); note that polyploid is 9, not 5.
ANEUPLOID = 0
HAPLOID = 1
DIPLOID = 2
TRIPLOID = 3
TETRAPLOID = 4
POLYPLOID = 9
ALLOWED_PLOIDIES = [ANEUPLOID, HAPLOID, DIPLOID, TRIPLOID, TETRAPLOID,
POLYPLOID]
# Maps the abbreviated sub-taxon marker to its rank name; the values are the
# same strings listed in ALLOWED_SUBTAXA.
SUBTAXAS = {
"subsp.": "subspecies",
"var.": "variety",
"convar.": "convarietas",
"group.": "group",
"f.": "forma",
"f.sp.": "forma.specialis"
}
# Control
# NOTE: this assignment overwrites the VERSION name imported from
# charset_normalizer at the top of this module.
VERSION = "Version"
DATE = "Date"
# Control sheet fields: header label plus attribute name.
CONTROL_FIELDS = [
{"label": "Version", "attribute": VERSION},
{"label": "Date", "attribute": DATE},
]
# Excel sheet names
LOCATIONS = "Geographic origin" # 'Locations'
GROWTH_MEDIA = "Growth media"
GENOMIC_INFO = "Genomic information"
STRAINS = "Strains"
LITERATURE_SHEET = "Literature"
SEXUAL_STATE_SHEET = "Sexual state"
RESOURCE_TYPES_VALUES = "Resource types values"
FORM_OF_SUPPLY_SHEET = "Forms of supply"
PLOIDY_SHEET = "Ploidy"
ONTOBIOTOPE = "Ontobiotope"
# NOTE: this rebinds MARKERS, which was set to "markers" in the Markers
# section above; importers of this module see the sheet name "Markers".
MARKERS = "Markers"
CONTROL_SHEET = "Version"

Binary file not shown.

Binary file not shown.

View File

@ -16,20 +16,19 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS
TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE, TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE,
HISTORY,NAGOYA1, VERSION) HISTORY,NAGOYA1, VERSION)
from mirri.settings import LOCATIONS, SUBTAXAS from mirri.settings import LOCATIONS, SUBTAXAS
from mirri.settings_v1 import LOCATIONS, SUBTAXAS
from mirri.validation.validation_conf_12052023 import version_config from mirri.validation.validation_conf_12052023 import version_config
from mirri.validation.validation_conf_12052023 import MIRRI_12052023_VALLIDATION_CONF
def validate_mirri_excel(fhand, version= "5.1.2" ):
if version == "5.1.2":
configuration = MIRRI_12052023_VALLIDATION_CONF
else:
raise NotImplementedError("Only version 5.1.2 is implemented")
def validate_mirri_excel(fhand, version="", date=""):
configuration = version_config.get(version)
if configuration is None:
raise NotImplementedError("Unsupported version: " + version)
configuration["date"] = date or configuration.get("date")
if configuration["date"] != "12/05/2023":
raise ValueError("Invalid date. Expected: 12/05/2023")
return validate_excel(fhand, configuration) return validate_excel(fhand, configuration)
def version(value , validation_conf=None): def version(value , validation_conf=None):
if value is None: if value is None:
return True return True
@ -210,8 +209,6 @@ def validate_row(row, validation_steps, in_memory_sheets):
kind = validation_step[TYPE] kind = validation_step[TYPE]
error_code = validation_step[ERROR_CODE] error_code = validation_step[ERROR_CODE]
if kind == NAGOYA: if kind == NAGOYA:
if not is_valid_nagoya_v20200601(row, in_memory_sheets):
return error_code
if not is_valid_nagoya_v12052023(row, in_memory_sheets): if not is_valid_nagoya_v12052023(row, in_memory_sheets):
return error_code return error_code
elif kind == BIBLIO: elif kind == BIBLIO:
@ -281,39 +278,10 @@ def is_valid_nago(row):
return True return True
def parsee_mirri_excel(row, in_memory_sheets, version=""): def parsee_mirri_excel(row, in_memory_sheets, version=""):
if version == "20200601": if version == "12052023":
return is_valid_nagoya_v20200601 (row, in_memory_sheets)
elif version == "12052023":
return is_valid_nagoya_v12052023 (row, in_memory_sheets) return is_valid_nagoya_v12052023 (row, in_memory_sheets)
else: else:
raise NotImplementedError("Only versions 20200601 and 12052023 are implemented") raise NotImplementedError("Only version is implemented")
def is_valid_nagoya_v20200601(row, in_memory_sheets):  # sourcery skip: return-identity
    """Apply the Nagoya row rule to a 20200601 strain row.

    The rule fails when the reference year is 2014 or later and no country
    can be resolved for the row's geographic origin.

    The reference date is the first non-empty value among "Date of
    collection", "Date of isolation", "Date of deposit" and "Date of
    inclusion in the catalogue", in that order.

    Args:
        row: mapping of column label to cell value.
        in_memory_sheets: mapping of sheet name to its indexed rows; only the
            LOCATIONS entry is read, and only when the row has a
            "Geographic origin" value.

    Returns:
        False when the year is >= 2014 and the country is unknown, otherwise
        True (including when no usable year can be extracted).
    """
    location_index = row.get('Geographic origin', None)
    if location_index is None:
        country = None
    else:
        geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {})
        country = geo_origin.get('Country', None)

    _date = None
    for date_field in ("Date of collection", "Date of isolation",
                       "Date of deposit", "Date of inclusion in the catalogue"):
        _date = row.get(date_field, None)
        if _date is not None:
            break

    year = None
    if _date is not None:
        if isinstance(_date, datetime):
            year = _date.year
        else:
            try:
                year = int(str(_date)[:4])
            except ValueError:
                # An empty or malformed date cell must not abort validation;
                # the date columns have their own DATE validation step, so
                # the year is simply treated as unknown here.
                year = None

    return not (year is not None and year >= 2014 and country is None)
def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity
location_index = row.get('geographicOrigin', None) location_index = row.get('geographicOrigin', None)

View File

@ -9,10 +9,9 @@ from mirri.validation.excel_validator import validate_mirri_excel
def main(): def main():
path = Path(sys.argv[1]) path = Path(sys.argv[1])
version = str(sys.argv[2]) version = str(sys.argv[2])
date = str(sys.argv[3])
try: try:
error_log = validate_mirri_excel(path.open("rb"), version=version, date=date) error_log = validate_mirri_excel(path.open("rb"), version=version)
except NotImplementedError as e: except NotImplementedError as e:
print(e) print(e)

View File

@ -4,7 +4,7 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS
UNIQUE,VERSION, UNIQUE,VERSION,
VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1) VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1)
from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET,) STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET)
@ -323,7 +323,7 @@ STRAIN_FIELDS = [
{ {
FIELD: "plasmidCollections", FIELD: "plasmidCollections",
VALIDATION: [ VALIDATION: [
{TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$", {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(\s*;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$",
ERROR_CODE: "STD62"} ERROR_CODE: "STD62"}
] ]
}, },
@ -358,7 +358,7 @@ STRAIN_FIELDS = [
{ {
FIELD: "sequenceLiterature", FIELD: "sequenceLiterature",
VALIDATION: [ VALIDATION: [
{TYPE: REGEXP, MATCH: "^\d+(;?\s*\d+)*$", ERROR_CODE: "STD61"}, {TYPE: REGEXP, MATCH: "^\d+(\s*;?\s*\d+)*$", ERROR_CODE: "STD61"},
] ]
}, },

View File

@ -1,545 +0,0 @@
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,
VALIDATION, VALUES, BIBLIO)
from mirri.settings_v1 import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS)
# GEOGRAPHIC_ORIGIN
# SEXUAL_STATE_SHEET,
# RESOURCE_TYPES_VALUES,
# FORM_OF_SUPPLY_SHEET,
# PLOIDY_SHEET)
# Column definitions for the strains sheet of the 20200601 layout (used as
# the COLUMNS of the STRAINS entry in SHEETS_SCHEMA below).
# Each entry names a column via FIELD and may carry a VALIDATION list; every
# validation step has a TYPE and an ERROR_CODE ("STDnn"), plus type-specific
# options such as VALUES, MULTIPLE/SEPARATOR, CROSSREF_NAME or min/max.
# NOTE(review): several entries have no VALIDATION key at all; presumably the
# validator treats that the same as an empty list - confirm.
STRAIN_FIELDS = [
{
FIELD: "Accession number",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: 'STD01'},
{TYPE: UNIQUE, ERROR_CODE: 'STD03'},
{TYPE: MISSING, ERROR_CODE: "STD02"},
{TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"}
]
},
{
FIELD: "Restrictions on use",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD05"},
{TYPE: MISSING, ERROR_CODE: "STD06"},
{TYPE: CHOICES, VALUES: ["1", "2", "3"],
MULTIPLE: False, ERROR_CODE: "STD07"}
]
},
{
FIELD: "Nagoya protocol restrictions and compliance conditions",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD08"},
{TYPE: MISSING, ERROR_CODE: "STD09"},
{TYPE: CHOICES, VALUES: ["1", "2", "3"],
MULTIPLE: False, ERROR_CODE: "STD10"}
]
},
{
FIELD: "ABS related files",
VALIDATION: [],
},
{
FIELD: "MTA file",
VALIDATION: [],
},
{
FIELD: "Other culture collection numbers",
# VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07",
# MULTIPLE: True, SEPARATOR: ";"}
# ]
},
{
FIELD: "Strain from a registered collection",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD11"}
]
},
{
FIELD: "Risk Group",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD12"},
{TYPE: MISSING, ERROR_CODE: "STD13"},
{TYPE: CHOICES, VALUES: ["1", "2", "3", "4"],
MULTIPLE: False, ERROR_CODE: "STD14"}
]
},
{
FIELD: "Dual use",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD15"}
]
},
{
FIELD: "Quarantine in Europe",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD16"}
]
},
{
FIELD: "Organism type",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD17"},
{TYPE: MISSING, ERROR_CODE: "STD18"},
# Both the organism names and the numeric codes "1".."9" are accepted.
{TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria",
"Cyanobacteria", "Filamentous Fungi",
"Phage", "Plasmid", "Virus", "Yeast",
"1", "2", "3", "4", "5", "6", "7", "8", "9"],
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"}
]
},
{
FIELD: "Taxon name",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD20"},
{TYPE: MISSING, ERROR_CODE: "STD21"},
{TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True,
SEPARATOR: ';'}
]
},
{
FIELD: "Infrasubspecific names",
},
{
FIELD: "Comment on taxonomy",
},
{
FIELD: "Interspecific hybrid",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD23"}
]
},
{
FIELD: "Status",
},
{
FIELD: "History of deposit",
VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex
# MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Depositor"
},
{
FIELD: "Date of deposit",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD25"},
]
},
{
FIELD: "Date of inclusion in the catalogue",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD26"},
]
},
{
FIELD: "Collected by",
},
{
FIELD: "Date of collection",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD27"},
]
},
{
FIELD: "Isolated by",
},
{
FIELD: "Date of isolation",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD28"},
]
},
{
FIELD: "Substrate/host of isolation",
},
{
FIELD: "Tested temperature growth range",
VALIDATION: [
# NOTE(review): this step uses the literal key "match" while the
# "Accession number" step uses the MATCH tag constant; the two agree only if
# MATCH == "match" - confirm against mirri.validation.tags.
{TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Recommended growth temperature",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD30"},
{TYPE: MISSING, ERROR_CODE: "STD31"},
{TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
ERROR_CODE: "STD32",
MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Recommended medium for growth",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD33"},
{TYPE: MISSING, ERROR_CODE: "STD34"},
# Growth media are separated with "/" here, unlike the ";" used by the
# other multi-valued columns.
{TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
]
},
{
FIELD: "Form of supply",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD36"},
{TYPE: MISSING, ERROR_CODE: "STD37"},
{TYPE: CHOICES, VALUES: ['Agar', 'Cryo', 'Dry Ice', 'Liquid Culture Medium',
'Lyo', 'Oil', 'Water'],
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"}
]
},
{
FIELD: "Other denomination",
},
{
FIELD: "Coordinates of geographic origin",
VALIDATION: [
{TYPE: COORDINATES, ERROR_CODE: "STD39"},
]
},
{
FIELD: "Altitude of geographic origin",
VALIDATION: [
{TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"},
]
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "Geographic origin",
},
{
FIELD: "Isolation habitat",
},
{
FIELD: "Ontobiotope term for the isolation habitat",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope",
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"}
]
},
{
FIELD: "GMO",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD42"}
]
},
{
FIELD: "GMO construction information",
},
{
FIELD: "Mutant information",
},
{
FIELD: "Genotype",
},
{
FIELD: "Sexual state",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET,
ERROR_CODE: "STD43"}
# {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha",
# "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha",
# "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"],
# ERROR_CODE: "STD43"}
]
},
{
FIELD: "Ploidy",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"],
ERROR_CODE: "STD44"}
]
},
{
FIELD: "Plasmids",
},
{
FIELD: "Plasmids collections fields",
},
{
# value can be in the cell or in another sheet. Don't configure this
# NOTE(review): despite the remark above, a CROSSREF step IS configured for
# this column - confirm which of the two is intended.
FIELD: "Literature",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET,
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"}
]
},
{
FIELD: "Plant pathogenicity code",
},
{
FIELD: "Pathogenicity",
},
{
FIELD: "Enzyme production",
},
{
FIELD: "Production of metabolites",
},
{
FIELD: "Applications",
},
{
FIELD: "Remarks"
},
{
FIELD: "Literature linked to the sequence/genome",
},
]
# Per-sheet validation schema for the 20200601 layout, keyed by sheet name.
# Each entry carries:
#   "acronym"      - three-letter prefix shared by that sheet's error codes
#   "id_field"     - name of the column holding the row identifier
#   VALIDATION     - optional sheet-level step (here always MANDATORY, "EFSnn")
#   ROW_VALIDATION - optional steps evaluated on a whole row
#   COLUMNS        - column definitions in the same shape as STRAIN_FIELDS
SHEETS_SCHEMA = {
LOCATIONS: {
"acronym": "GOD",
"id_field": "ID",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"},
COLUMNS: [
{
FIELD: "ID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD01"},
{TYPE: MISSING, ERROR_CODE: "GOD02"},
]
},
{
FIELD: "Country",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD03"},
{TYPE: MISSING, ERROR_CODE: "GOD04"}
]
},
{
FIELD: "Region",
VALIDATION: []
},
{
FIELD: "City",
VALIDATION: []
},
{
FIELD: "Locality",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD06"},
{TYPE: MISSING, ERROR_CODE: "GOD07"}
]
}
],
},
GROWTH_MEDIA: {
"acronym": "GMD",
"id_field": "Acronym",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"},
COLUMNS: [
{
FIELD: "Acronym",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GMD01"},
{TYPE: MISSING, ERROR_CODE: "GMD02"}
]
},
{
FIELD: "Description",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GMD03"},
{TYPE: MISSING, ERROR_CODE: "GMD04"}
]
},
{
FIELD: "Full description",
VALIDATION: []
},
],
},
GENOMIC_INFO: {
"acronym": "GID",
"id_field": "Strain AN",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"},
COLUMNS: [
{
FIELD: "Strain AN",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID01"},
{TYPE: MISSING, ERROR_CODE: "GID02"},
{TYPE: CROSSREF, CROSSREF_NAME: "Strains",
ERROR_CODE: "GID03"},
]
},
{
FIELD: "Marker",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID04"},
{TYPE: MISSING, ERROR_CODE: "GID05"},
{TYPE: CROSSREF, CROSSREF_NAME: MARKERS, ERROR_CODE: "GID06"}
]
},
{
FIELD: "INSDC AN",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID07"},
{TYPE: MISSING, ERROR_CODE: "GID08"},
]
},
{
FIELD: "Sequence",
VALIDATION: []
},
],
},
STRAINS: {
"acronym": "STD",
'id_field': 'Accession number',
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
# Row-level Nagoya check, reported as STD46.
ROW_VALIDATION: [
{TYPE: NAGOYA, ERROR_CODE: "STD46"},
],
COLUMNS: STRAIN_FIELDS,
},
LITERATURE_SHEET: {
"acronym": "LID",
'id_field': 'ID',
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"},
# Row-level bibliographic check, reported as LID17.
ROW_VALIDATION: [
{TYPE: BIBLIO, ERROR_CODE: 'LID17'}
],
COLUMNS: [
{
FIELD: "ID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID01"},
{TYPE: MISSING, ERROR_CODE: "LID02"},
]
},
{
FIELD: "Full reference",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID03"},
]
},
{
FIELD: "Authors",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID05"},
]
},
{
FIELD: "Title",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID07"},
]
},
{
FIELD: "Journal",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID09"},
]
},
{
FIELD: "Year",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID11"},
]
},
{
FIELD: "Volume",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID13"},
]
},
{
FIELD: "Issue",
VALIDATION: []
},
{
FIELD: "First page",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID15"},
{TYPE: MISSING, ERROR_CODE: "LID16"},
]
},
{
FIELD: "Last page",
VALIDATION: []
},
{
FIELD: "Book title",
VALIDATION: []
},
{
FIELD: "Editors",
VALIDATION: []
},
{
FIELD: "Publisher",
VALIDATION: []
}
],
},
# SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []},
# RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []},
# FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []},
# PLOIDY_SHEET: {"acronym": "PLD", COLUMNS: []},
ONTOBIOTOPE: {
"acronym": "OTD",
"id_field": "ID",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"},
COLUMNS: [
{
FIELD: "ID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "OTD01"},
{TYPE: MISSING, ERROR_CODE: "OTD02"},
]
},
{
FIELD: "Name",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "OTD03"},
{TYPE: MISSING, ERROR_CODE: "OTD04"},
]
},
]
},
# The markers sheet is the only entry without a sheet-level VALIDATION step.
MARKERS: {
"acronym": "MKD",
"id_field": "Acronym",
COLUMNS: [
{
FIELD: "Acronym",
VALIDATION: []
},
{
FIELD: "Marker",
VALIDATION: []
},
],
},
}
# Maps a sheet name to a list of column names of that sheet.
# NOTE(review): presumably these are the columns whose values other sheets
# may reference through CROSSREF steps; the consumer is not in this module.
# SEXUAL_STATE_SHEET maps to an empty list.
CROSS_REF_CONF = {
ONTOBIOTOPE: ['ID', 'Name'],
LITERATURE_SHEET: ['ID'],
LOCATIONS: ['Locality'],
GROWTH_MEDIA: ['Acronym'],
STRAINS: ["Accession number"],
SEXUAL_STATE_SHEET: [],
MARKERS: ["Acronym"],
}
# Complete validation configuration for the 20200601 layout: the sheet
# schema, the cross-reference configuration, and the sheets to keep in memory
# together with the column they are indexed by (only the locations sheet,
# indexed by 'Locality').
# The "VALLIDATION" spelling is part of the public name; keep it as is.
MIRRI_20200601_VALLIDATION_CONF = {
'sheet_schema': SHEETS_SCHEMA,
'cross_ref_conf': CROSS_REF_CONF,
'keep_sheets_in_memory': [
{'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}]
}