# mirri_utils/validation/validation_conf_12052023.py
#
# 674 lines
# 18 KiB
# Python

from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,VERSION,
VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1)
from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET,)
# GEOGRAPHIC_ORIGIN
# SEXUAL_STATE_SHEET,
# RESOURCE_TYPES_VALUES,
# FORM_OF_SUPPLY_SHEET,
# PLOIDY_SHEET)
# Column-level validation rules for the strains sheet.
#
# Each entry pairs a column name (FIELD) with the ordered list of validation
# steps (VALIDATION) configured for it; an empty list means the column is
# declared but has no checks attached.  Every step carries a TYPE tag and an
# ERROR_CODE, plus step-specific options (VALUES, MATCH, CROSSREF_NAME,
# MULTIPLE, SEPARATOR).  This list is used as the COLUMNS of the STRAINS
# sheet in SHEETS_SCHEMA.
#
# All regular expressions are keyed by the MATCH tag; patterns containing
# backslashes are raw strings so they hold no invalid escape sequences.
STRAIN_FIELDS = [
    {
        FIELD: "accessionNumber",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD01"},
            {TYPE: UNIQUE, ERROR_CODE: "STD03"},
            {TYPE: MISSING, ERROR_CODE: "STD02"},
            {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"},
        ],
    },
    {
        FIELD: "useRestrictions",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD05"},
            {TYPE: MISSING, ERROR_CODE: "STD06"},
            {TYPE: CHOICES, VALUES: ["1", "2", "3"],
             MULTIPLE: False, ERROR_CODE: "STD07"},
        ],
    },
    {
        FIELD: "mirriAccessionNumber",
        VALIDATION: [
            {TYPE: UNIQUE, ERROR_CODE: "STD51"},
            {TYPE: REGEXP, MATCH: "^MIRRI[0-9]{7}$", ERROR_CODE: "STD52"},
        ],
    },
    {
        FIELD: "nagoyaConditions",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD08"},
            {TYPE: MISSING, ERROR_CODE: "STD09"},
            {TYPE: CHOICES, VALUES: ["1", "2", "3"],
             MULTIPLE: False, ERROR_CODE: "STD10"},
        ],
    },
    {
        FIELD: "absFile",
        VALIDATION: [
            {TYPE: TITLE, ERROR_CODE: "STD59"},
            {TYPE: URL_TITLE, ERROR_CODE: "STD60",
             MULTIPLE: True, SEPARATOR: ";"},
        ],
    },
    {
        FIELD: "siteLinks",
        VALIDATION: [
            {TYPE: DOMINIO, ERROR_CODE: "STD53",
             MULTIPLE: False, SEPARATOR: ";"},
            {TYPE: URL_DOMINIO, ERROR_CODE: "STD56",
             MULTIPLE: False, SEPARATOR: ";"},
        ],
    },
    {
        FIELD: "mtaFile",
        VALIDATION: [
            {TYPE: JUST_URL, ERROR_CODE: "STD58",
             MULTIPLE: True, SEPARATOR: ";"},
        ],
    },
    {
        FIELD: "otherCollectionNumbers",
        VALIDATION: [
            {TYPE: REGEXP, MATCH: "([^ ]* [^ ]*)(; [^ ]* [^ ]*)*$",
             ERROR_CODE: "STD63", MULTIPLE: True, SEPARATOR: ";"},
            # {TYPE: CROSSREF, CROSSREF_NAME: "Strains", ERROR_CODE: "STD64"},
        ],
    },
    {
        FIELD: "registeredCollection",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD11"},
        ],
    },
    {
        FIELD: "type",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD64"},
        ],
    },
    {
        FIELD: "riskGroup",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD12"},
            {TYPE: MISSING, ERROR_CODE: "STD13"},
            {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"],
             MULTIPLE: False, ERROR_CODE: "STD14"},
        ],
    },
    {
        FIELD: "dualUse",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD15"},
        ],
    },
    {
        FIELD: "euQuarantine",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD16"},
        ],
    },
    {
        FIELD: "axenicCulture",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["Axenic", "Not axenic"],
             ERROR_CODE: "STD50"},
        ],
    },
    {
        FIELD: "organismType",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD17"},
            {TYPE: MISSING, ERROR_CODE: "STD18"},
            # Both spellings of "Filamentous fungi" and the numeric codes
            # "1".."7" are accepted alongside the names.
            {TYPE: CHOICES,
             VALUES: ["Algae", "Archaea", "Bacteria",
                      "Cyanobacteria", "Filamentous Fungi",
                      "Filamentous fungi",
                      "Yeast", "Microalgae",
                      "1", "2", "3", "4", "5", "6", "7"],
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"},
        ],
    },
    {
        FIELD: "speciesName",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD20"},
            {TYPE: MISSING, ERROR_CODE: "STD21"},
            {TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True,
             SEPARATOR: ";"},
        ],
    },
    {FIELD: "infrasubspecificNames", VALIDATION: []},
    {FIELD: "taxonomyComments", VALIDATION: []},
    {
        FIELD: "hybrid",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD23"},
        ],
    },
    {
        FIELD: "status",
        VALIDATION: [
            # No alternative ends with a space: the single separator space
            # after the group applies uniformly to all four prefixes.
            {TYPE: REGEXP,
             MATCH: "^(type of|neotype of|holotype of|epitype of) ([a-zA-Z .'-]+)$",
             ERROR_CODE: "STD65"},
        ],
    },
    {
        FIELD: "depositHistory",
        VALIDATION: [
            {TYPE: HISTORY, ERROR_CODE: "STD24"},
        ],
    },
    {FIELD: "depositor", VALIDATION: []},
    {
        FIELD: "depositDate",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD25"},
        ],
    },
    {
        FIELD: "accessionDate",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD26"},
        ],
    },
    {FIELD: "collector", VALIDATION: []},
    {FIELD: "substrate", VALIDATION: []},
    {
        FIELD: "temperatureGrowthRange",
        VALIDATION: [
            # Pattern for a signed decimal number with optional exponent.
            {TYPE: REGEXP,
             MATCH: r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?",
             ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"},
        ],
    },
    {
        FIELD: "recommendedTemperature",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD30"},
            {TYPE: MISSING, ERROR_CODE: "STD31"},
            # Same numeric pattern as temperatureGrowthRange.
            {TYPE: REGEXP,
             MATCH: r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?",
             ERROR_CODE: "STD32",
             MULTIPLE: True, SEPARATOR: ";"},
        ],
    },
    {
        FIELD: "supplyForms",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD36"},
            {TYPE: MISSING, ERROR_CODE: "STD37"},
            {TYPE: CHOICES,
             VALUES: ["Agar", "Cryo", "Dry Ice", "Liquid Culture Medium",
                      "Lyo", "Oil", "Water"],
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"},
        ],
    },
    {FIELD: "otherDenomination", VALIDATION: []},
    {
        FIELD: "geographicCoordinates",
        VALIDATION: [
            {TYPE: COORDINATES, ERROR_CODE: "STD39"},
        ],
    },
    {
        # value can be in the cell or in another sheet. Don't configure this
        # NOTE(review): validation steps are nevertheless configured below;
        # confirm whether the remark above is still current.
        FIELD: "geographicOrigin",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD68"},
            {TYPE: MISSING, ERROR_CODE: "STD69"},
            {TYPE: CROSSREF, CROSSREF_NAME: "Geographic origin",
             ERROR_CODE: "STD46"},
        ],
    },
    {FIELD: "isolationHabitat", VALIDATION: []},
    {
        FIELD: "ontobiotopeTerms",
        VALIDATION: [
            {TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope",
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"},
        ],
    },
    {
        FIELD: "qps",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD49"},
        ],
    },
    {
        FIELD: "gmo",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD42"},
        ],
    },
    {FIELD: "gmoConstruction", VALIDATION: []},
    {FIELD: "mutant", VALIDATION: []},
    {FIELD: "genotype", VALIDATION: []},
    {FIELD: "Plant pathogenicity code", VALIDATION: []},
    {
        FIELD: "sexualState",
        VALIDATION: [
            {TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET,
             ERROR_CODE: "STD43"},
            # {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha",
            # "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha",
            # "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"],
            # ERROR_CODE: "STD43"}
        ],
    },
    {
        FIELD: "ploidy",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2", "3", "4", "5", "9"],
             ERROR_CODE: "STD44"},
        ],
    },
    {FIELD: "plasmids", VALIDATION: []},
    {
        FIELD: "plasmidCollections",
        VALIDATION: [
            # Pattern shape: "<name>(<name> <digits>)" repeated, joined by ";".
            {TYPE: REGEXP,
             MATCH: r"([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$",
             ERROR_CODE: "STD62"},
        ],
    },
    {
        # value can be in the cell or in another sheet. Don't configure this
        # NOTE(review): a CROSSREF step is nevertheless configured below;
        # confirm whether the remark above is still current.
        FIELD: "identificationLiterature",
        VALIDATION: [
            {TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET,
             MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"},
        ],
    },
    {FIELD: "pathogenicity", VALIDATION: []},
    {FIELD: "enzymes", VALIDATION: []},
    {FIELD: "metabolites", VALIDATION: []},
    {FIELD: "applications", VALIDATION: []},
    {FIELD: "remarks", VALIDATION: []},
    {
        FIELD: "sequenceLiterature",
        VALIDATION: [
            # Digit groups separated by an optional ";" and optional spaces.
            {TYPE: REGEXP, MATCH: r"^\d+(;?\s*\d+)*$", ERROR_CODE: "STD61"},
        ],
    },
    {
        FIELD: "recommendedMedium",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD33"},
            {TYPE: MISSING, ERROR_CODE: "STD34"},
            # Unlike the other multi-valued columns, this one uses "/" as
            # its separator.
            {TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
             MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"},
        ],
    },
    {
        FIELD: "country",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD54"},
            {TYPE: MISSING, ERROR_CODE: "STD55"},
            {TYPE: ISO, ERROR_CODE: "STD57"},
            # {TYPE: CROSSREF, CROSSREF_NAME: COUNTRY_CODES_SHEET, ERROR_CODE: "STD57"}
        ],
    },
]
# Per-sheet validation schema, keyed by the sheet-name constants imported
# from mirri.settings.
#
# Each sheet entry holds:
#   "acronym"      - three-letter prefix; where the sheet's columns define
#                    ERROR_CODE values they start with it ("GOD01" for "GOD")
#   "id_field"     - name of one of the sheet's COLUMNS
#   VALIDATION     - a single sheet-level step (not present for MARKERS)
#   ROW_VALIDATION - list of row-level steps (only given for STRAINS and
#                    LITERATURE_SHEET)
#   COLUMNS        - ordered list of {FIELD, VALIDATION} column configs
SHEETS_SCHEMA = {
    LOCATIONS: {
        "acronym": "GOD",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"},
        COLUMNS: [
            {
                FIELD: "ID",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GOD01"},
                    {TYPE: MISSING, ERROR_CODE: "GOD02"},
                ],
            },
            {
                FIELD: "Country",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GOD03"},
                    {TYPE: MISSING, ERROR_CODE: "GOD04"},
                ],
            },
            {FIELD: "Region", VALIDATION: []},
            {FIELD: "City", VALIDATION: []},
            {
                FIELD: "Locality",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GOD06"},
                    {TYPE: MISSING, ERROR_CODE: "GOD07"},
                ],
            },
        ],
    },
    GROWTH_MEDIA: {
        "acronym": "GMD",
        "id_field": "Acronym",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"},
        COLUMNS: [
            {
                FIELD: "Acronym",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GMD01"},
                    {TYPE: MISSING, ERROR_CODE: "GMD02"},
                ],
            },
            {
                FIELD: "Description",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GMD03"},
                    {TYPE: MISSING, ERROR_CODE: "GMD04"},
                ],
            },
            {FIELD: "Full description", VALIDATION: []},
        ],
    },
    GENOMIC_INFO: {
        "acronym": "GID",
        "id_field": "Strain AN",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"},
        COLUMNS: [
            {
                FIELD: "Strain AN",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GID01"},
                    {TYPE: MISSING, ERROR_CODE: "GID02"},
                    {TYPE: CROSSREF, CROSSREF_NAME: "Strains",
                     ERROR_CODE: "GID03"},
                ],
            },
            {
                FIELD: "Marker",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GID04"},
                    {TYPE: MISSING, ERROR_CODE: "GID05"},
                    {TYPE: CROSSREF, CROSSREF_NAME: MARKERS,
                     ERROR_CODE: "GID06"},
                ],
            },
            {
                FIELD: "INSDC AN",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GID07"},
                    {TYPE: MISSING, ERROR_CODE: "GID08"},
                    # Pattern: two capital letters followed by six digits.
                    {TYPE: REGEXP, MATCH: "^[A-Z]{2}[0-9]{6}$",
                     ERROR_CODE: "GID11"},
                ],
            },
            {FIELD: "Sequence", VALIDATION: []},
        ],
    },
    STRAINS: {
        "acronym": "STD",
        "id_field": "accessionNumber",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
        ROW_VALIDATION: [],
        # Column rules for this sheet live in the STRAIN_FIELDS list.
        COLUMNS: STRAIN_FIELDS,
    },
    LITERATURE_SHEET: {
        "acronym": "LID",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"},
        ROW_VALIDATION: [
            {TYPE: BIBLIO, ERROR_CODE: "LID17"},
        ],
        COLUMNS: [
            {
                FIELD: "ID",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID01"},
                    {TYPE: MISSING, ERROR_CODE: "LID02"},
                ],
            },
            {FIELD: "PMID",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID18"}]},
            {FIELD: "DOI",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID20"}]},
            {FIELD: "Full reference",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID03"}]},
            {FIELD: "Authors",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID05"}]},
            {FIELD: "Title",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID07"}]},
            {FIELD: "Journal",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID09"}]},
            {FIELD: "Year",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID11"}]},
            {FIELD: "Volume",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID13"}]},
            {FIELD: "Issue", VALIDATION: []},
            {FIELD: "First page",
             VALIDATION: [{TYPE: MANDATORY, ERROR_CODE: "LID15"}]},
            {FIELD: "Last page", VALIDATION: []},
            {FIELD: "Book title", VALIDATION: []},
            {FIELD: "Editors", VALIDATION: []},
            {FIELD: "Publisher", VALIDATION: []},
        ],
    },
    # SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []},
    # RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []},
    # FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []},
    # PLOIDY_SHEET: {"acronym": "PLD", COLUMNS: []},
    ONTOBIOTOPE: {
        "acronym": "OTD",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"},
        COLUMNS: [
            {
                FIELD: "ID",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "OTD01"},
                    {TYPE: MISSING, ERROR_CODE: "OTD02"},
                ],
            },
            {FIELD: "Name", VALIDATION: []},
        ],
    },
    CONTROL_SHEET: {
        "acronym": "CTR",
        "id_field": "Version",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS09"},
        COLUMNS: [
            {
                FIELD: "Version",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "CTR01"},
                    {TYPE: MISSING, ERROR_CODE: "CTR02"},
                    {TYPE: VERSION, ERROR_CODE: "CTR05"},
                ],
            },
            {
                FIELD: "Date",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "CTR03"},
                    {TYPE: MISSING, ERROR_CODE: "CTR04"},
                ],
            },
        ],
    },
    # This sheet carries no sheet-level VALIDATION entry and no column checks.
    MARKERS: {
        "acronym": "MKD",
        "id_field": "Acronym",
        COLUMNS: [
            {FIELD: "Acronym", VALIDATION: []},
            {FIELD: "Marker", VALIDATION: []},
        ],
    },
}
# Maps each sheet-name constant to a list of column names from that sheet.
# NOTE(review): presumably these are the columns whose values are collected
# as cross-reference targets for CROSSREF steps — confirm against the
# validator.  SEXUAL_STATE_SHEET is listed with no columns.
CROSS_REF_CONF = {
    ONTOBIOTOPE: ["ID"],
    LITERATURE_SHEET: [
        "ID",
        "DOI",
        "PMID",
        "Full reference",
        "Authors",
        "Title",
        "Journal",
        "Year",
        "Volume",
        "First page",
    ],
    LOCATIONS: ["ID", "Locality"],
    GROWTH_MEDIA: ["Acronym"],
    STRAINS: ["accessionNumber"],
    SEXUAL_STATE_SHEET: [],
    MARKERS: ["Acronym"],
}
# Bundles the sheet schema and the cross-reference configuration, together
# with the list of sheets flagged under "keep_sheets_in_memory" (here only
# LOCATIONS, indexed by its "Locality" column).
# NOTE(review): the misspelling "VALLIDATION" is part of the public name and
# is kept so existing references keep working.
MIRRI_12052023_VALLIDATION_CONF = {
    "sheet_schema": SHEETS_SCHEMA,
    "cross_ref_conf": CROSS_REF_CONF,
    "keep_sheets_in_memory": [
        {
            "sheet_name": LOCATIONS,
            "indexed_by": "Locality",
        },
    ],
}
# Associates the version string "5.1.2" with the validation configuration
# above; the extra "date" key stores the date string "12/05/2023" in the
# same dict.
version_config = {
    "5.1.2": MIRRI_12052023_VALLIDATION_CONF,
    "date": "12/05/2023",
}