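# mirri_utils/mirri/validation/validation_conf_20200601.py
#
# Validation configuration (MIRRI_20200601_VALLIDATION_CONF) together with the
# per-sheet and per-column rules it is assembled from.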

from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,
VALIDATION, VALUES, BIBLIO)
from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS)
# GEOGRAPHIC_ORIGIN
# SEXUAL_STATE_SHEET,
# RESOURCE_TYPES_VALUES,
# FORM_OF_SUPPLY_SHEET,
# PLOIDY_SHEET)
# Column definitions for the strains sheet (referenced as COLUMNS of the
# STRAINS entry in SHEETS_SCHEMA).
#
# Each entry names a column through FIELD and may carry a VALIDATION list.
# Every validation step is a dict with a TYPE, an ERROR_CODE and whatever
# extra options that TYPE takes (VALUES, MATCH, CROSSREF_NAME, MULTIPLE,
# SEPARATOR, ...).  Entries without a VALIDATION key (or with an empty list)
# declare the column without configuring any check for it.
#
# Option keys and cross-referenced sheet names are always spelled through the
# imported constants so that they cannot drift from the names used by
# CROSS_REF_CONF and by the REGEXP step of "Accession number".
STRAIN_FIELDS = [
    {
        FIELD: "Accession number",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD01"},
            {TYPE: UNIQUE, ERROR_CODE: "STD03"},
            {TYPE: MISSING, ERROR_CODE: "STD02"},
            # Two space-free parts separated by exactly one space.
            {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"},
        ],
    },
    {
        FIELD: "Restrictions on use",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD05"},
            {TYPE: MISSING, ERROR_CODE: "STD06"},
            {
                TYPE: CHOICES,
                VALUES: ["1", "2", "3"],
                MULTIPLE: False,
                ERROR_CODE: "STD07",
            },
        ],
    },
    {
        FIELD: "Nagoya protocol restrictions and compliance conditions",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD08"},
            {TYPE: MISSING, ERROR_CODE: "STD09"},
            {
                TYPE: CHOICES,
                VALUES: ["1", "2", "3"],
                MULTIPLE: False,
                ERROR_CODE: "STD10",
            },
        ],
    },
    {
        FIELD: "ABS related files",
        VALIDATION: [],
    },
    {
        FIELD: "MTA file",
        VALIDATION: [],
    },
    {
        FIELD: "Other culture collection numbers",
        # Disabled check, kept for reference.  NOTE(review): its error code
        # STD07 is now used by "Restrictions on use"; pick a new code before
        # re-enabling.
        # VALIDATION: [
        #     {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD07",
        #      MULTIPLE: True, SEPARATOR: ";"}
        # ]
    },
    {
        FIELD: "Strain from a registered collection",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD11"},
        ],
    },
    {
        FIELD: "Risk Group",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD12"},
            {TYPE: MISSING, ERROR_CODE: "STD13"},
            {
                TYPE: CHOICES,
                VALUES: ["1", "2", "3", "4"],
                MULTIPLE: False,
                ERROR_CODE: "STD14",
            },
        ],
    },
    {
        FIELD: "Dual use",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD15"},
        ],
    },
    {
        FIELD: "Quarantine in Europe",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD16"},
        ],
    },
    {
        FIELD: "Organism type",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD17"},
            {TYPE: MISSING, ERROR_CODE: "STD18"},
            {
                TYPE: CHOICES,
                # Both the textual names and the codes "1".."9" are accepted.
                VALUES: [
                    "Algae", "Archaea", "Bacteria",
                    "Cyanobacteria", "Filamentous Fungi",
                    "Phage", "Plasmid", "Virus", "Yeast",
                    "1", "2", "3", "4", "5", "6", "7", "8", "9",
                ],
                MULTIPLE: True,
                SEPARATOR: ";",
                ERROR_CODE: "STD19",
            },
        ],
    },
    {
        FIELD: "Taxon name",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD20"},
            {TYPE: MISSING, ERROR_CODE: "STD21"},
            {
                TYPE: TAXON,
                ERROR_CODE: "STD22",
                MULTIPLE: True,
                SEPARATOR: ";",
            },
        ],
    },
    {
        FIELD: "Infrasubspecific names",
    },
    {
        FIELD: "Comment on taxonomy",
    },
    {
        FIELD: "Interspecific hybrid",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD23"},
        ],
    },
    {
        FIELD: "Status",
    },
    {
        FIELD: "History of deposit",
        VALIDATION: [
            # Disabled until the regex is reworked (original note: "modify
            # the regex").
            # {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD24",
            #  MULTIPLE: True, SEPARATOR: ";"}
        ],
    },
    {
        FIELD: "Depositor",
    },
    {
        FIELD: "Date of deposit",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD25"},
        ],
    },
    {
        FIELD: "Date of inclusion in the catalogue",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD26"},
        ],
    },
    {
        FIELD: "Collected by",
    },
    {
        FIELD: "Date of collection",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD27"},
        ],
    },
    {
        FIELD: "Isolated by",
    },
    {
        FIELD: "Date of isolation",
        VALIDATION: [
            {TYPE: DATE, ERROR_CODE: "STD28"},
        ],
    },
    {
        FIELD: "Substrate/host of isolation",
    },
    {
        FIELD: "Tested temperature growth range",
        VALIDATION: [
            {
                TYPE: REGEXP,
                # Optionally signed decimal number with optional exponent.
                MATCH: r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?",
                ERROR_CODE: "STD29",
                MULTIPLE: True,
                SEPARATOR: ";",
            },
        ],
    },
    {
        FIELD: "Recommended growth temperature",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD30"},
            {TYPE: MISSING, ERROR_CODE: "STD31"},
            {
                TYPE: REGEXP,
                # Optionally signed decimal number with optional exponent.
                MATCH: r"[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?",
                ERROR_CODE: "STD32",
                MULTIPLE: True,
                SEPARATOR: ";",
            },
        ],
    },
    {
        FIELD: "Recommended medium for growth",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD33"},
            {TYPE: MISSING, ERROR_CODE: "STD34"},
            {
                TYPE: CROSSREF,
                CROSSREF_NAME: GROWTH_MEDIA,
                MULTIPLE: True,
                # Unlike the other multi-valued columns, media are
                # separated by "/" rather than ";".
                SEPARATOR: "/",
                ERROR_CODE: "STD35",
            },
        ],
    },
    {
        FIELD: "Form of supply",
        VALIDATION: [
            {TYPE: MANDATORY, ERROR_CODE: "STD36"},
            {TYPE: MISSING, ERROR_CODE: "STD37"},
            {
                TYPE: CHOICES,
                VALUES: [
                    "Agar", "Cryo", "Dry Ice", "Liquid Culture Medium",
                    "Lyo", "Oil", "Water",
                ],
                MULTIPLE: True,
                SEPARATOR: ";",
                ERROR_CODE: "STD38",
            },
        ],
    },
    {
        FIELD: "Other denomination",
    },
    {
        FIELD: "Coordinates of geographic origin",
        VALIDATION: [
            {TYPE: COORDINATES, ERROR_CODE: "STD39"},
        ],
    },
    {
        FIELD: "Altitude of geographic origin",
        VALIDATION: [
            {TYPE: NUMBER, "max": 8000, "min": -200, ERROR_CODE: "STD40"},
        ],
    },
    {
        # value can be in the cell or in another sheet. Don't configure this
        FIELD: "Geographic origin",
    },
    {
        FIELD: "Isolation habitat",
    },
    {
        FIELD: "Ontobiotope term for the isolation habitat",
        VALIDATION: [
            {
                TYPE: CROSSREF,
                CROSSREF_NAME: ONTOBIOTOPE,
                MULTIPLE: True,
                SEPARATOR: ";",
                ERROR_CODE: "STD41",
            },
        ],
    },
    {
        FIELD: "GMO",
        VALIDATION: [
            {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD42"},
        ],
    },
    {
        FIELD: "GMO construction information",
    },
    {
        FIELD: "Mutant information",
    },
    {
        FIELD: "Genotype",
    },
    {
        FIELD: "Sexual state",
        VALIDATION: [
            {
                TYPE: CROSSREF,
                CROSSREF_NAME: SEXUAL_STATE_SHEET,
                ERROR_CODE: "STD43",
            },
            # Former inline list of allowed values, superseded by the
            # cross-reference above:
            # {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha",
            #  "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha",
            #  "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"],
            #  ERROR_CODE: "STD43"}
        ],
    },
    {
        FIELD: "Ploidy",
        VALIDATION: [
            {
                TYPE: CHOICES,
                VALUES: ["0", "1", "2", "3", "4", "9"],
                ERROR_CODE: "STD44",
            },
        ],
    },
    {
        FIELD: "Plasmids",
    },
    {
        FIELD: "Plasmids collections fields",
    },
    {
        # NOTE(review): this entry used to carry the comment "value can be in
        # the cell or in another sheet. Don't configure this", yet a
        # cross-reference check against the literature sheet IS configured
        # below -- confirm which of the two is intended.
        FIELD: "Literature",
        VALIDATION: [
            {
                TYPE: CROSSREF,
                CROSSREF_NAME: LITERATURE_SHEET,
                MULTIPLE: True,
                SEPARATOR: ";",
                ERROR_CODE: "STD45",
            },
        ],
    },
    {
        FIELD: "Plant pathogenicity code",
    },
    {
        FIELD: "Pathogenicity",
    },
    {
        FIELD: "Enzyme production",
    },
    {
        FIELD: "Production of metabolites",
    },
    {
        FIELD: "Applications",
    },
    {
        FIELD: "Remarks",
    },
    {
        FIELD: "Literature linked to the sequence/genome",
    },
]
# Per-sheet schema, keyed by sheet name (constants from mirri.settings).
#
# Every sheet entry holds:
#   "acronym"      - short prefix that also starts the sheet's column-level
#                    error codes (GOD.., GMD.., GID.., STD.., LID.., OTD..),
#   "id_field"     - name of the column that identifies a row,
#   VALIDATION     - sheet-level check (every sheet but Markers declares one),
#   ROW_VALIDATION - optional list of checks that apply to a whole row,
#   COLUMNS        - list of column definitions ({FIELD: ..., VALIDATION: [...]}).
SHEETS_SCHEMA = {
    LOCATIONS: {
        "acronym": "GOD",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"},
        COLUMNS: [
            {
                FIELD: "ID",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GOD01"},
                    {TYPE: MISSING, ERROR_CODE: "GOD02"},
                ],
            },
            {
                FIELD: "Country",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GOD03"},
                    {TYPE: MISSING, ERROR_CODE: "GOD04"},
                ],
            },
            {
                FIELD: "Region",
                VALIDATION: [],
            },
            {
                FIELD: "City",
                VALIDATION: [],
            },
            {
                FIELD: "Locality",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GOD06"},
                    {TYPE: MISSING, ERROR_CODE: "GOD07"},
                ],
            },
        ],
    },
    GROWTH_MEDIA: {
        "acronym": "GMD",
        "id_field": "Acronym",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"},
        COLUMNS: [
            {
                FIELD: "Acronym",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GMD01"},
                    {TYPE: MISSING, ERROR_CODE: "GMD02"},
                ],
            },
            {
                FIELD: "Description",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GMD03"},
                    {TYPE: MISSING, ERROR_CODE: "GMD04"},
                ],
            },
            {
                FIELD: "Full description",
                VALIDATION: [],
            },
        ],
    },
    GENOMIC_INFO: {
        "acronym": "GID",
        "id_field": "Strain AN",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"},
        COLUMNS: [
            {
                FIELD: "Strain AN",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GID01"},
                    {TYPE: MISSING, ERROR_CODE: "GID02"},
                    # Refer to the strains sheet through the same constant
                    # that keys this schema, not a hard-coded copy of its
                    # name.
                    {
                        TYPE: CROSSREF,
                        CROSSREF_NAME: STRAINS,
                        ERROR_CODE: "GID03",
                    },
                ],
            },
            {
                FIELD: "Marker",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GID04"},
                    {TYPE: MISSING, ERROR_CODE: "GID05"},
                    {
                        TYPE: CROSSREF,
                        CROSSREF_NAME: MARKERS,
                        ERROR_CODE: "GID06",
                    },
                ],
            },
            {
                FIELD: "INSDC AN",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "GID07"},
                    {TYPE: MISSING, ERROR_CODE: "GID08"},
                ],
            },
            {
                FIELD: "Sequence",
                VALIDATION: [],
            },
        ],
    },
    STRAINS: {
        "acronym": "STD",
        "id_field": "Accession number",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
        ROW_VALIDATION: [
            {TYPE: NAGOYA, ERROR_CODE: "STD46"},
        ],
        COLUMNS: STRAIN_FIELDS,
    },
    LITERATURE_SHEET: {
        "acronym": "LID",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"},
        ROW_VALIDATION: [
            {TYPE: BIBLIO, ERROR_CODE: "LID17"},
        ],
        # NOTE(review): only "ID" and "First page" pair MANDATORY with a
        # MISSING step; the even-numbered codes LID04..LID14 are unused here
        # -- presumably reserved for MISSING checks, confirm.
        COLUMNS: [
            {
                FIELD: "ID",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID01"},
                    {TYPE: MISSING, ERROR_CODE: "LID02"},
                ],
            },
            {
                FIELD: "Full reference",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID03"},
                ],
            },
            {
                FIELD: "Authors",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID05"},
                ],
            },
            {
                FIELD: "Title",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID07"},
                ],
            },
            {
                FIELD: "Journal",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID09"},
                ],
            },
            {
                FIELD: "Year",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID11"},
                ],
            },
            {
                FIELD: "Volume",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID13"},
                ],
            },
            {
                FIELD: "Issue",
                VALIDATION: [],
            },
            {
                FIELD: "First page",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "LID15"},
                    {TYPE: MISSING, ERROR_CODE: "LID16"},
                ],
            },
            {
                FIELD: "Last page",
                VALIDATION: [],
            },
            {
                FIELD: "Book title",
                VALIDATION: [],
            },
            {
                FIELD: "Editors",
                VALIDATION: [],
            },
            {
                FIELD: "Publisher",
                VALIDATION: [],
            },
        ],
    },
    # Sheets that are currently not part of the schema:
    # SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []},
    # RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []},
    # FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []},
    # PLOIDY_SHEET: {"acronym": "PLD", COLUMNS: []},
    ONTOBIOTOPE: {
        "acronym": "OTD",
        "id_field": "ID",
        VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"},
        COLUMNS: [
            {
                FIELD: "ID",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "OTD01"},
                    {TYPE: MISSING, ERROR_CODE: "OTD02"},
                ],
            },
            {
                FIELD: "Name",
                VALIDATION: [
                    {TYPE: MANDATORY, ERROR_CODE: "OTD03"},
                    {TYPE: MISSING, ERROR_CODE: "OTD04"},
                ],
            },
        ],
    },
    MARKERS: {
        # NOTE(review): unlike every other sheet, no sheet-level VALIDATION
        # is declared here -- confirm that this is intentional.
        "acronym": "MKD",
        "id_field": "Acronym",
        COLUMNS: [
            {
                FIELD: "Acronym",
                VALIDATION: [],
            },
            {
                FIELD: "Marker",
                VALIDATION: [],
            },
        ],
    },
}
# Sheet name -> list of column names of that sheet.
# Presumably these are the columns whose values may be used as
# cross-reference targets by CROSSREF validation steps -- confirm against the
# validator.  The sexual-state sheet is listed with no columns.
CROSS_REF_CONF = {
    ONTOBIOTOPE: [
        "ID",
        "Name",
    ],
    LITERATURE_SHEET: [
        "ID",
    ],
    LOCATIONS: [
        "Locality",
    ],
    GROWTH_MEDIA: [
        "Acronym",
    ],
    STRAINS: [
        "Accession number",
    ],
    SEXUAL_STATE_SHEET: [],
    MARKERS: [
        "Acronym",
    ],
}
# Top-level configuration object bundling the sheet schema, the
# cross-reference configuration and the sheets to keep in memory.
# NOTE(review): "VALLIDATION" is misspelled, but the name is kept unchanged
# because code outside this file may import it.
MIRRI_20200601_VALLIDATION_CONF = {
    "sheet_schema": SHEETS_SCHEMA,
    "cross_ref_conf": CROSS_REF_CONF,
    "keep_sheets_in_memory": [
        {
            "sheet_name": LOCATIONS,
            "indexed_by": "Locality",
        },
    ],
}