Compare commits

...

10 Commits

98 changed files with 1163 additions and 3534 deletions

.gitignore vendored
View File

@ -1,3 +0,0 @@
mirri/biolomics/secrets.py
.vscode/launch.json
*.pyc

View File

@ -1,34 +0,0 @@
# MIRRI Utils
## Installation
> pip install path_to_package.tar.gz
## Description
A small set of utilities to deal with MIRRI data.
- A data class to deal with strain data.
- An Excel reader for the MIRRI specification
- An Excel validator for the MIRRI specification
- An Excel writer to create spreadsheets following the MIRRI specification
## Update 06-09-2022
Under the bin directory:
bin\
upload_strains_to_mirri_is_NEWDB.py
validateNEW.py
These files were created to insert the data from the Excel files into the mirridb database.
validateNEW.py:
> the purpose of this file is to act as the orchestrator for the validations and to call the upload to mirridb.
upload_strains_to_mirri_is_NEWDB.py:
This script inserts the Excel data into the database; the code has comments describing each step.
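A minimal invocation sketch, assuming the package is installed as above. As currently written, both scripts read a hard-coded Excel path (C:/data/brclims_excel.xlsx), and the upload script also hard-codes the database connection settings, so they take no command-line options:
> python bin/validateNEW.py
> python bin/upload_strains_to_mirri_is_NEWDB.py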

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,77 +0,0 @@
#!/usr/bin/env python3
import argparse
import sys
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS, STRAIN_WS
SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
def get_cmd_args():
desc = "Upload strains to MIRRI-IS"
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-a', '--accession_number', required=True,
help='Delete the duplicated items in database for the given accession number')
parser.add_argument('-u', '--ws_user', help='Username of the web service',
required=True)
parser.add_argument('-p', '--ws_password', required=True,
help='Password of the web service user')
parser.add_argument('-c', '--client_id', required=True,
help='Client id of the web service')
parser.add_argument('-s', '--client_secret', required=True,
help='Client secret of the web service')
args = parser.parse_args()
return {'accession_number': args.accession_number, 'user': args.ws_user,
'password': args.ws_password, 'client_id': args.client_id,
'client_secret': args.client_secret}
def write_errors_in_screen(errors, fhand=sys.stderr):
for key, errors_by_type in errors.items():
fhand.write(f'{key}\n')
fhand.write('-' * len(key) + '\n')
for error in errors_by_type:
if error.pk:
fhand.write(f'{error.pk}: ')
fhand.write(f'{error.message} - {error.code}\n')
fhand.write('\n')
def main():
args = get_cmd_args()
out_fhand = sys.stdout
client = BiolomicsMirriClient(server_url=SERVER_URL, api_version= 'v2',
client_id=args['client_id'],
client_secret=args['client_secret'],
username=args['user'],
password=args['password'])
query = {"Query": [{"Index": 0,
"FieldName": "Collection accession number",
"Operation": "TextExactMatch",
"Value": args['accession_number']}],
"Expression": "Q0",
"DisplayStart": 0,
"DisplayLength": 10}
result = client.search(STRAIN_WS, query=query)
total = result["total"]
if total == 0:
out_fhand.write('Accession not in database\n')
sys.exit(0)
elif total == 1:
out_fhand.write('Accession is not duplicated\n')
sys.exit(0)
print(f'Duplicates found: {total}. removing duplicates')
duplicated_ids = [record.record_id for record in result['records']]
for duplicated_id in duplicated_ids[:-1]:
client.delete_by_id(STRAIN_WS, duplicated_id)
if __name__ == '__main__':
main()

View File

@ -1,91 +0,0 @@
#!/usr/bin/env python3
import argparse
import sys
from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS, STRAIN_WS
from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.validation.excel_validator import validate_mirri_excel
SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
def get_cmd_args():
desc = "Upload strains to MIRRI-IS"
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-i', '--input', help='Validated Excel file',
type=argparse.FileType('rb'), required=True)
parser.add_argument('-v', '--spec_version', default='20200601',
help='Version of the specification of the given Excel file')
parser.add_argument('-u', '--ws_user', help='Username of the web service',
required=True)
parser.add_argument('-p', '--ws_password', required=True,
help='Password of the web service user')
parser.add_argument('-c', '--client_id', required=True,
help='Client id of the web service')
parser.add_argument('-s', '--client_secret', required=True,
help='Client secret of the web service')
parser.add_argument('-f', '--force_update', required=False,
action='store_true',
help='Use it if you want to update the existing strains')
args = parser.parse_args()
return {'input_fhand': args.input, 'user': args.ws_user,
'version': args.spec_version,
'password': args.ws_password, 'client_id': args.client_id,
'client_secret': args.client_secret, 'update': args.force_update}
def write_errors_in_screen(errors, fhand=sys.stderr):
for key, errors_by_type in errors.items():
fhand.write(f'{key}\n')
fhand.write('-' * len(key) + '\n')
for error in errors_by_type:
if error.pk:
fhand.write(f'{error.pk}: ')
fhand.write(f'{error.message} - {error.code}\n')
fhand.write('\n')
def main():
args = get_cmd_args()
input_fhand = args['input_fhand']
spec_version = args['version']
out_fhand = sys.stderr
error_log = validate_mirri_excel(input_fhand, version=spec_version)
errors = error_log.get_errors()
if errors:
write_errors_in_screen(errors, out_fhand)
sys.exit(1)
input_fhand.seek(0)
parsed_objects = parse_mirri_excel(input_fhand, version=spec_version)
strains = list(parsed_objects['strains'])
growth_media = list(parsed_objects['growth_media'])
client = BiolomicsMirriClient(server_url=SERVER_URL, api_version= 'v2',
client_id=args['client_id'],
client_secret=args['client_secret'],
username=args['user'],
password=args['password'])
for gm in growth_media:
try:
client.delete_by_name(GROWTH_MEDIUM_WS, gm.acronym)
except ValueError as error:
print(error)
continue
print(f'Growth medium {gm.acronym} deleted')
for strain in strains:
ws_strain = retrieve_strain_by_accession_number(client, strain.id.strain_id)
if ws_strain is not None:
client.delete_by_id(STRAIN_WS, ws_strain.record_id)
print(f'Strain {strain.id.strain_id} deleted')
else:
print(f'Strain {strain.id.strain_id} not in database')
if __name__ == '__main__':
main()

View File

@ -1,182 +0,0 @@
#!/usr/bin/env python3
import argparse
import sys
from collections import Counter
from mirri.biolomics.pipelines.growth_medium import get_or_create_or_update_growth_medium
from mirri.biolomics.pipelines.strain import get_or_create_or_update_strain
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.validation.excel_validator import validate_mirri_excel
TEST_SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
PROD_SERVER_URL = 'https://webservices.bio-aware.com/mirri'
def get_cmd_args():
desc = "Upload strains to MIRRI-IS"
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-i', '--input', help='Validated Excel file',
type=argparse.FileType('rb'), required=True)
parser.add_argument('-v', '--spec_version', default='20200601',
help='Version of the specification of the given Excel file')
parser.add_argument('-u', '--ws_user', help='Username of the web service',
required=True)
parser.add_argument('-p', '--ws_password', required=True,
help='Password of the web service user')
parser.add_argument('-c', '--client_id', required=True,
help='Client id of the web service')
parser.add_argument('-s', '--client_secret', required=True,
help='Client secret of the web service')
parser.add_argument('--force_update', required=False,
action='store_true',
help='Use it if you want to update the existing strains')
parser.add_argument('--verbose', action='store_true',
help='use it if you want a verbose output')
parser.add_argument('--prod', action='store_true',
help='Use production server')
parser.add_argument('--dont_add_gm', action='store_false',
help="Don't add growth media", default=True)
parser.add_argument('--dont_add_strains', action='store_false',
help="Don't add growth media", default=True)
parser.add_argument('--skip_first_num', type=int,
help='skip first X strains to the tool')
args = parser.parse_args()
return {'input_fhand': args.input, 'user': args.ws_user,
'version': args.spec_version,
'password': args.ws_password, 'client_id': args.client_id,
'client_secret': args.client_secret, 'update': args.force_update,
'verbose': args.verbose, 'use_production_server': args.prod,
'add_gm': args.dont_add_gm, 'add_strains': args.dont_add_strains,
'skip_first_num': args.skip_first_num}
def write_errors_in_screen(errors, fhand=sys.stderr):
for key, errors_by_type in errors.items():
fhand.write(f'{key}\n')
fhand.write('-' * len(key) + '\n')
for error in errors_by_type:
if error.pk:
fhand.write(f'{error.pk}: ')
fhand.write(f'{error.message} - {error.code}\n')
fhand.write('\n')
def create_or_upload_strains(client, strains, update=False, counter=None,
out_fhand=None, seek=None):
for index, strain in enumerate(strains):
if seek is not None and index < seek:
continue
# if strain.id.strain_id != 'CECT 5766':
# continue
result = get_or_create_or_update_strain(client, strain, update=update)
new_strain = result['record']
created = result['created']
updated = result.get('updated', False)
if updated:
result_state = 'updated'
elif created:
result_state = 'created'
else:
result_state = 'not modified'
if counter is not None:
counter[result_state] += 1
if out_fhand is not None:
out_fhand.write(f'{index}: Strain {new_strain.id.strain_id}: {result_state}\n')
# break
def create_or_upload_growth_media(client, growth_media, update=False, counter=None,
out_fhand=None):
for gm in growth_media:
result = get_or_create_or_update_growth_medium(client, gm, update)
new_gm = result['record']
created = result['created']
updated = result.get('updated', False)
if updated:
result_state = 'updated'
elif created:
result_state = 'created'
else:
result_state = 'not modified'
if counter is not None:
counter[result_state] += 1
if out_fhand is not None:
out_fhand.write(f'Growth medium {new_gm.record_name}: {result_state}\n')
def main():
args = get_cmd_args()
input_fhand = args['input_fhand']
spec_version = args['version']
out_fhand = sys.stdout
error_log = validate_mirri_excel(input_fhand, version=spec_version)
errors = error_log.get_errors()
skip_first_num = args['skip_first_num']
if errors:
write_errors_in_screen(errors, out_fhand)
sys.exit(1)
input_fhand.seek(0)
parsed_objects = parse_mirri_excel(input_fhand, version=spec_version)
strains = list(parsed_objects['strains'])
growth_media = list(parsed_objects['growth_media'])
server_url = PROD_SERVER_URL if args['use_production_server'] else TEST_SERVER_URL
client = BiolomicsMirriClient(server_url=server_url, api_version='v2',
client_id=args['client_id'],
client_secret=args['client_secret'],
username=args['user'],
password=args['password'],
verbose=args['verbose'])
if args['add_gm']:
client.start_transaction()
counter = Counter()
try:
create_or_upload_growth_media(client, growth_media, update=args['update'],
counter=counter, out_fhand=out_fhand)
except (Exception, KeyboardInterrupt) as error:
out_fhand.write('There were some errors in the Growth media upload\n')
out_fhand.write(str(error) + '\n')
out_fhand.write('Rolling back\n')
client.rollback()
raise
client.finish_transaction()
show_stats(counter, 'Growth Media', out_fhand)
if args['add_strains']:
client.start_transaction()
counter = Counter()
try:
create_or_upload_strains(client, strains, update=args['update'],
counter=counter,
out_fhand=out_fhand, seek=skip_first_num)
client.finish_transaction()
except (Exception, KeyboardInterrupt) as error:
out_fhand.write('There were some errors in the Strain upload\n')
out_fhand.write(str(error) + '\n')
out_fhand.write('rolling back\n')
# client.rollback()
raise
client.finish_transaction()
show_stats(counter, 'Strains', out_fhand)
def show_stats(counter, kind, out_fhand):
out_fhand.write(f'{kind}\n')
line = ''.join(['-'] * len(kind))
out_fhand.write(f"{line}\n")
for kind2, value in counter.most_common(5):
out_fhand.write(f'{kind2}: {value}\n')
out_fhand.write('\n')
if __name__ == '__main__':
main()

View File

@ -1,224 +0,0 @@
#!/usr/bin/env python3
import argparse
import sys
from collections import Counter
#
from mirri.biolomics.pipelines.growth_medium import get_or_create_or_update_growth_medium
from mirri.biolomics.pipelines.strain import get_or_create_or_update_strain
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.validation.excel_validator import validate_mirri_excel
##Database
from sqlalchemy import create_engine, MetaData
import pymysql
import pandas as pd
from pathlib import Path
import numpy as np
# DEFINE THE DATABASE CREDENTIALS
user = 'mirridev'
password = 'estramboticandolotodo'
host = 'mirri-is.mirri.org'
port = 33066
database = 'mirri-db'
TEST_SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
PROD_SERVER_URL = 'https://webservices.bio-aware.com/mirri'
def show_stats(counter, kind, out_fhand):
out_fhand.write(f'{kind}\n')
line = ''.join(['-'] * len(kind))
out_fhand.write(f"{line}\n")
for kind2, value in counter.most_common(5):
out_fhand.write(f'{kind2}: {value}\n')
out_fhand.write('\n')
def get_cmd_args():
desc = "Upload strains to MIRRI-IS"
parser = argparse.ArgumentParser(description=desc)
parser.add_argument('-i' , '--input', required=True, help='Validated Excel file', type=argparse.FileType('rb'))
parser.add_argument('-v' , '--spec_version', default='20200601', help='Version of the specification of the given Excel file')
parser.add_argument('-u' , '--ws_user', help='Username of the web service')
parser.add_argument('-p' , '--ws_password', required=True, help='Password of the web service user')
parser.add_argument('-c' , '--client_id', required=True, help='Client id of the web service')
parser.add_argument('-s' , '--client_secret', required=True, help='Client secret of the web service')
parser.add_argument('--force_update' , required=False, action='store_true', help='Use it if you want to update the existing strains')
parser.add_argument('--verbose' , action='store_true', help='use it if you want a verbose output')
parser.add_argument('--prod' , action='store_true', help='Use production server')
parser.add_argument('--dont_add_gm' , default=True, action='store_false', help="Don't add growth media")
parser.add_argument('--dont_add_strains', default=True, action='store_false', help="Don't add strains")
parser.add_argument('--skip_first_num' , type=int, help='skip first X strains to the tool')
args = parser.parse_args()
return {'input_fhand': args.input
,'user': args.ws_user
,'version': args.spec_version
,'password': args.ws_password
,'client_id': args.client_id
,'client_secret': args.client_secret
,'update': args.force_update
,'verbose': args.verbose
,'use_production_server': args.prod
,'add_gm': args.dont_add_gm
,'add_strains': args.dont_add_strains
,'skip_first_num': args.skip_first_num
}
def write_errors_in_screen(errors, fhand=sys.stderr):
for key, errors_by_type in errors.items():
fhand.write(f'{key}\n')
fhand.write('-' * len(key) + '\n')
for error in errors_by_type:
if error.pk:
fhand.write(f'{error.pk}: ')
fhand.write(f'{error.message} - {error.code}\n')
fhand.write('\n')
def create_or_upload_strains(client, strains, update=False, counter=None, out_fhand=None, seek=None):
for index, strain in enumerate(strains):
if seek is not None and index < seek:
continue
# if strain.id.strain_id != 'CECT 5766':
# continue
result = get_or_create_or_update_strain(client, strain, update=update)
new_strain = result['record']
created = result['created']
updated = result.get('updated', False)
if updated:
result_state = 'updated'
elif created:
result_state = 'created'
else:
result_state = 'not modified'
if counter is not None:
counter[result_state] += 1
if out_fhand is not None:
out_fhand.write(f'{index}: Strain {new_strain.id.strain_id}: {result_state}\n')
# break
def create_or_upload_growth_media(client, growth_media, update=False, counter=None, out_fhand=None):
for gm in growth_media:
result = get_or_create_or_update_growth_medium(client, gm, update)
new_gm = result['record']
created = result['created']
updated = result.get('updated', False)
if updated:
result_state = 'updated'
elif created:
result_state = 'created'
else:
result_state = 'not modified'
if counter is not None:
counter[result_state] += 1
if out_fhand is not None:
out_fhand.write(f'Growth medium {new_gm.record_name}: {result_state}\n')
def get_connection():
# PYTHON FUNCTION TO CONNECT TO THE MYSQL DATABASE AND
# RETURN THE SQLALCHEMY ENGINE OBJECT
return create_engine(url="mysql+pymysql://{0}:{1}@{2}:{3}/{4}".format(user, password, host, port, database))
def main():
## Load Excel
path = Path('C://data//brclims_excel.xlsx')
Excel_Data = pd.read_excel(path, sheet_name = None)
cc_id=1
## Load Database
sqlEngine = get_connection()
engine = sqlEngine.connect()
## Create new file upload
with engine.connect() as conn:
metaDats = MetaData(conn, schema=database)
metaDats.reflect(bind=conn)
table = metaDats.tables['mirri-db.file_upload']
stmt = table.insert().values(filename=path.name,cc_id=cc_id)
aux=conn.execute(stmt).inserted_primary_key[0]
## Load all Sheet from Excel to DB
for key in Excel_Data.keys():
#print(key)
n=Excel_Data[key].replace(np.nan, '', regex=True).astype(str)
n.columns = n.columns.str.replace(' ','_')
n['f_id']=aux
n.to_sql(key, engine, index=False, if_exists='append')
"""
args = get_cmd_args()
input_fhand = args['input_fhand']
spec_version = args['version']
out_fhand = sys.stdout
error_log = validate_mirri_excel(input_fhand, version=spec_version)
errors = error_log.get_errors()
skip_first_num = args['skip_first_num']
if errors:
write_errors_in_screen(errors, out_fhand)
sys.exit(1)
input_fhand.seek(0)
parsed_objects = parse_mirri_excel(input_fhand, version=spec_version)
strains = list(parsed_objects['strains'])
growth_media = list(parsed_objects['growth_media'])
server_url = PROD_SERVER_URL if args['use_production_server'] else TEST_SERVER_URL
client = BiolomicsMirriClient(server_url=server_url, api_version='v2',
client_id=args['client_id'],
client_secret=args['client_secret'],
username=args['user'],
password=args['password'],
verbose=args['verbose'])
if args['add_gm']:
client.start_transaction()
counter = Counter()
try:
create_or_upload_growth_media(client, growth_media, update=args['update'],
counter=counter, out_fhand=out_fhand)
except (Exception, KeyboardInterrupt) as error:
out_fhand.write('There were some errors in the Growth media upload\n')
out_fhand.write(str(error) + '\n')
out_fhand.write('Rolling back\n')
client.rollback()
raise
client.finish_transaction()
show_stats(counter, 'Growth Media', out_fhand)
if args['add_strains']:
client.start_transaction()
counter = Counter()
try:
create_or_upload_strains(client, strains, update=args['update'],
counter=counter,
out_fhand=out_fhand, seek=skip_first_num)
client.finish_transaction()
except (Exception, KeyboardInterrupt) as error:
out_fhand.write('There were some errors in the Strain upload\n')
out_fhand.write(str(error) + '\n')
out_fhand.write('rolling back\n')
# client.rollback()
raise
client.finish_transaction()
show_stats(counter, 'Strains', out_fhand)
"""
if __name__ == '__main__':
main()

View File

@ -1,21 +0,0 @@
#!/usr/bin/env python
import sys
from pathlib import Path
from mirri.validation.excel_validator import validate_mirri_excel
import warnings
warnings.simplefilter("ignore")
def main():
# path = Path(sys.argv[1])
path = Path( 'C:/data/brclims_excel.xlsx')
error_log = validate_mirri_excel(path.open("rb"))
for errors in error_log.get_errors().values():
for error in errors:
print(error.pk, error.message, error.code)
if __name__ == "__main__":
main()

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -37,14 +37,14 @@ TRUEFALSE_TRANSLATOR = {
}
def parse_mirri_excel(fhand, version="20200601"):
if version == "20200601":
return _parse_mirri_v20200601(fhand)
def parse_mirri_excel(fhand, version=""):
if version == "5.1.2":
return _parse_mirri_v12052023(fhand)
else:
raise NotImplementedError("Only version 20200601 is implemented")
raise NotImplementedError("Only version is 5.1.2 implemented")
def _parse_mirri_v20200601(fhand):
def _parse_mirri_v12052023(fhand):
fhand.seek(0)
file_content = BytesIO(fhand.read())
wb = load_workbook(filename=file_content, read_only=True, data_only=True)
@ -64,7 +64,6 @@ def _parse_mirri_v20200601(fhand):
return {"strains": strains, "growth_media": growth_media}
def index_list_by(list_, id_):
return {str(item[id_]): item for item in list_}
@ -125,7 +124,7 @@ def parse_strains(wb, locations, growth_media, markers, publications,
publications = index_list_by_attr(publications, 'id')
markers = index_markers(markers)
for strain_row in workbook_sheet_reader(wb, STRAINS, "Accession number"):
for strain_row in workbook_sheet_reader(wb, STRAINS, "accessionNumber"):
strain = StrainMirri()
strain_id = None
label = None
@ -202,8 +201,18 @@ def parse_strains(wb, locations, growth_media, markers, publications,
items = value.split(";")
strain.collect.location.latitude = float(items[0])
strain.collect.location.longitude = float(items[1])
strain.collect.location.precision = float(items[2])
strain.collect.location.altitude = float(items[3])
if len(items) > 4:
strain.collect.location.coord_uncertainty = items[4]
elif attribute == "collect.site.links":
items = value.split(";")
strain.collect.site.links.nameSite = str(items[0])
strain.collect.site.links.urlSite = str(items[1])
rsetattr(strain, attribute, value.split(";"))  # check the separator
if len(items) > 2:
strain.collect.location.coord_uncertainty = items[2]
strain.collect.site.links.site_uncertainty = items[2]
elif attribute == "collect.location":
location = locations[value]

View File

@ -50,11 +50,10 @@ PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS]
def write_mirri_excel(path, strains, growth_media, version):
if version == "20200601":
_write_mirri_excel_20200601(path, strains, growth_media)
if version == "5.1.2":
_write_mirri_excel_12052023(path, strains, growth_media)
def _write_mirri_excel_20200601(path, strains, growth_media):
def _write_mirri_excel_12052023(path, strains, growth_media):
wb = Workbook()
write_markers_sheet(wb)
@ -104,7 +103,7 @@ def _write_mirri_excel_20200601(path, strains, growth_media):
redimension_cell_width(pub_sheet)
# write sexual states
sex_sheet = wb.create_sheet("Sexual states")
sex_sheet = wb.create_sheet("Sexual state")
for sex_state in sorted(list(sexual_states)):
sex_sheet.append([sex_state])
redimension_cell_width(sex_sheet)
@ -121,7 +120,6 @@ def _write_mirri_excel_20200601(path, strains, growth_media):
del wb["Sheet"]
wb.save(str(path))
def _deserialize_strains(strains, locations, growth_media_indexes,
publications, sexual_states, genomic_markers):
for strain in strains:
@ -189,8 +187,19 @@ def _deserialize_strains(strains, locations, growth_media_indexes,
elif attribute == "collect.location.coords":
lat = strain.collect.location.latitude
long = strain.collect.location.longitude
if lat is not None and long is not None:
value = f"{lat};{long}"
alt = strain.collect.location.altitude
prec = strain.collect.location.precision
if lat is not None and long is not None and prec is not None and alt is not None:
value = f"{lat};{long};{prec};{alt}"
else:
value = None
elif attribute == "collect.site.links":
name = strain.collect.site.links.nameSite
url = strain.collect.site.links.urlSite
value = rgetattr(strain, attribute)
value = ";".join(value)
if name is not None and url is not None:
value = f"{name};{url}"
else:
value = None

View File

@ -1,50 +0,0 @@
from mirri import rgetattr
def validate_strain(strain, version='20200601'):
if version == '20200601':
return _validate_strain_v20200601(strain)
raise NotImplementedError('Only v20200601 is implemented')
def _validate_strain_v20200601(strain):
mandatory_attrs = [{'label': 'Accession Number', 'attr': 'id.strain_id'},
{'label': 'Nagoya protocol', 'attr': 'nagoya_protocol'},
{'label': 'Restriction on use', 'attr': 'restriction_on_use'},
{'label': 'Risk group', 'attr': 'risk_group'},
{'label': 'Organism type', 'attr': 'taxonomy.organism_type'},
{'label': 'Taxon name', 'attr': 'taxonomy.long_name'},
{'label': 'Recommended temperature to growth', 'attr': 'growth.recommended_temp'},
{'label': 'Recommended media', 'attr': 'growth.recommended_media'},
{'label': 'Form of supply', 'attr': 'form_of_supply'},
{'label': 'Country', 'attr': 'collect.location.country'}]
errors = []
for mandatory in mandatory_attrs:
value = rgetattr(strain, mandatory['attr'])
if value is None:
errors.append(f"{mandatory['label']} is mandatory field")
if not is_valid_nagoya(strain):
errors.append('Not compliant with Nagoya protocol requirements')
return errors
def is_valid_nagoya(strain):
# nagoya_requirements
_date = strain.collect.date
if _date is None:
_date = strain.isolation.date
if _date is None:
_date = strain.deposit.date
if _date is None:
_date = strain.catalog_inclusion_date
# print(_date)
year = None if _date is None else _date._year
if year is not None and year >= 2014 and strain.collect.location.country is None:
return False
return True

View File

@ -1,414 +0,0 @@
from typing import Optional
class ErrorMessage():
"""Error message
Args:
code (str): Error code.
pk (str | optional): The instance's primary key that triggered the error. Defaults to None.
value (str | optional): The instance's value that triggered the error. Defaults to None.
"""
def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None):
self.code = code.upper()
self.pk = pk
self.value = value
@property
def _codes(self) -> list:
return [
func
for func in dir(self)
if func.isupper() and
callable(getattr(self, func)) and
not func.startswith("__")
]
@property
def _messages(self) -> dict:
return {code: getattr(self, code) for code in self._codes}
@property
def message(self) -> str:
if not self._validate_code():
raise ValueError(f"{self.code} not found")
return self._messages[self.code]()
@property
def code(self) -> str:
return self._code
@code.setter
def code(self, code: str) -> None:
self._code = code.upper()
def _validate_code(self) -> bool:
return self.code in self._codes
@property
def pk(self) -> str:
return self._pk
@pk.setter
def pk(self, pk: str) -> None:
self._pk = pk
@property
def value(self) -> str:
return self._value
@value.setter
def value(self, value: str) -> None:
self._value = value
"""
Excel File Structure Error Codes
"""
def EXL00(self):
return f"The provided file '{self.pk}' is not an excel(xlsx) file"
def EFS01(self):
return "The 'Growth media' sheet is missing. Please check the provided excel template."
def EFS02(self):
return "The 'Geographic origin' sheet is missing. Please check the provided excel template."
def EFS03(self):
return "The 'Literature' sheet is missing. Please check the provided excel template."
def EFS04(self):
return "The 'Sexual state' sheet is missing. Please check the provided excel template."
def EFS05(self):
return "The 'Strains' sheet is missing. Please check the provided excel template."
def EFS06(self):
return "The 'Ontobiotope' sheet is missing. Please check the provided excel template."
def EFS07(self):
return "The 'Markers' sheet is missing. Please check the provided excel template."
def EFS08(self):
return "The 'Genomic information' sheet is missing. Please check the provided excel template."
"""
Growth Media Error Codes
"""
def GMD01(self):
return "The 'Acronym' column is a mandatory field in the Growth Media sheet."
def GMD02(self):
return "The 'Acronym' column is empty or has missing values."
def GMD03(self):
return "The 'Description' column is a mandatory field in the Growth Media sheet. The column can not be empty."
def GMD04(self):
return f"The 'Description' for growth media with Acronym {self.pk} is missing."
"""
Geographic Origin Error Codes
"""
def GOD01(self):
return "The 'ID' column is a mandatory field in the Geographic Origin sheet."
def GOD02(self):
return "The 'ID' column is empty or has missing values."
def GOD03(self):
return "The 'Country' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
def GOD04(self):
return f"The 'Country' for geographic origin with ID {self.pk} is missing."
def GOD05(self):
return f"The 'Country' for geographic origin with ID {self.pk} is incorrect."
def GOD06(self):
return f"The 'Locality' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
def GOD07(self):
return f"The 'Locality' for geographic origin with ID {self.pk} is missing."
"""
Literature Error Codes
"""
def LID01(self):
return "The 'ID' column is a mandatory field in the Literature sheet."
def LID02(self):
return "The 'ID' column empty or missing values."
def LID03(self):
return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID04(self):
return f"The 'Full reference' for literature with ID {self.pk} is missing."
def LID05(self):
return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID06(self):
return f"The 'Authors' for literature with ID {self.pk} is missing."
def LID07(self):
return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID08(self):
return f"The 'Title' for literature with ID {self.pk} is missing."
def LID09(self):
return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID10(self):
return f"The 'Journal' for literature with ID {self.pk} is missing."
def LID11(self):
return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID12(self):
return f"The 'Year' for literature with ID {self.pk} is missing."
def LID13(self):
return "The 'Volume' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID14(self):
return f"The 'Volume' for literature with ID {self.pk} is missing."
def LID15(self):
return "The 'First page' column is a mandatory field. The column can not be empty."
def LID16(self):
return f"The 'First page' for literature with ID {self.pk} is missing."
def LID17(self):
msg = 'If Journal: Title, Authors, Journal, Year and First page are required. '
msg += 'If Book: Book Title, Authors, Year, Editors and Publisher are required.'
return msg
"""
Strains Error Codes
"""
def STD01(self):
return "The 'Accession number' column is a mandatory field in the Strains sheet."
def STD02(self):
return "The 'Accession number' column is empty or has missing values."
def STD03(self):
return f"The 'Accesion number' must be unique. The '{self.value}' is repeated."
def STD04(self):
return (f"The 'Accession number' {self.pk} is not according to the specification."
" The value must be of the format '<Sequence of characters> <sequence of characters>'.")
def STD05(self):
return f"The 'Restriction on use' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD06(self):
return f"The 'Restriction on use' for strain with Accession Number {self.pk} is missing."
def STD07(self):
return (f"The 'Restriction on use' for strain with Accession Number {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD08(self):
return f"The 'Nagoya protocol restrictions and compliance conditions' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD09(self):
return f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is missing."
def STD10(self):
return (f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD11(self):
return (f"The 'Strain from a registered collection' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD12(self):
return "The 'Risk group' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD13(self):
return f"The 'Risk group' for strain with Accession Number {self.pk} is missing."
def STD14(self):
return (f"The 'Risk group' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.")
def STD15(self):
return (f"The 'Dual use' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD16(self):
return (f"The “Quarantine in europe” for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD17(self):
return f"The 'Organism type' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD18(self):
return f"The 'Organism type' for strain with Accession Number {self.pk} is missing."
def STD19(self):
return (f"The 'Organism type' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', "
"'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.")
def STD20(self):
return f"The 'Taxon name' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD21(self):
return f"The 'Taxon name' for strain with Accession Number {self.pk} is missing."
def STD22(self):
return f"The 'Taxon name' for strain with Accession Number {self.pk} is incorrect."
def STD23(self):
return (f"The 'Interspecific hybrid' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD24(self):
return f"The 'History of deposit' for strain with Accession Number {self.pk} is incorrect."
def STD25(self):
return (f"The 'Date of deposit' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD26(self):
return (f"The 'Date of inclusion in the catalogue' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD27(self):
return (f"The 'Date of collection' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD28(self):
return (f"The 'Date of isolation' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD29(self):
return (f"The 'Tested temperature growth range' for strain with Accession Number {self.pk} is incorrect."
" It must have two decimal numbers separated by ','")
def STD30(self):
return f"The 'Recommended growth temperature' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD31(self):
return f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is missing."
def STD32(self):
return (f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is incorrect."
" It must have two decimal numbers separated by ','.")
def STD33(self):
return f"The 'Recommended medium for growth' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD34(self):
return f"The 'Recommended medium for growth' for strain with Accession Number {self.pk} is missing."
def STD35(self):
return f"The value of 'Recommended medium for growth' for strain with Accession Number {self.pk} is not in the Growth Media Sheet."
def STD36(self):
return f"The 'Forms of supply' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD37(self):
return f"The 'Forms of supply' for strain with Accession Number {self.pk} is missing."
def STD38(self):
return f"The value of 'Forms of supply' for strain with Accession Number {self.pk} is not in the Forms of Supply Sheet."
def STD39(self):
return (f"The 'Coordinates of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
"The allowed formats are two or three decimal numbers separated by ','. Moreover, the first number must be"
"between [-90, 90], the second between [-180, 180], and the third, if provided, can assume any value.")
def STD40(self):
return (f"The 'Altitude of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
"The allowed formats are one decimal number between [-200, 8000].")
def STD41(self):
return f"The value of 'Ontobiotope term for the isolation habitat' for strain with Accession Number {self.pk} is not in the Ontobiotope Sheet."
def STD42(self):
return (f"The 'GMO' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2")
def STD43(self):
return (f"The 'Sexual State' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', "
"'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'")
def STD44(self):
return (f"The 'Ploidy' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9")
def STD45(self):
msg = f"At least one of the values '{self.value}' of the literature field for strain {self.pk} are not in the literature sheet. "
msg += "If the those values are Pubmed ids or DOIs, please ignore this messsage"
return msg
def STD46(self):
msg = f"If date of collection/isolation/deposit/inclusion in the catalog is after 2014," \
f" the value of column Geographic Origin must be provided and associated with a country in the " \
f"Geographic Origin sheet. The value is missing or not associated with a country for strain {self.pk}."
return msg
"""
Genomic Information Error Codes
"""
def GID01(self):
return f"The 'Strain Acession Number' (Strain AN) column is a mandatory field in the Genomic Information Sheet."
def GID02(self):
return f"The 'Strain Acession Number' (Strain AN) column is empty or has missing values."
def GID03(self):
return f"The value of 'Strain Acession Number' (Strain AN) {self.value} is not in the Strains sheet."
def GID04(self):
return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
def GID05(self):
return f"The 'Marker' for genomic information with Strain AN {self.pk} is missing."
def GID06(self):
return f"The value of 'Marker' {self.value} is not in the Markers sheet."
def GID07(self):
return f"The 'INSDC AN' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
def GID08(self):
return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is missing."
def GID09(self):
return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is incorrect."
def GID10(self):
return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect."
" It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces.")
"""
Ontobiotope Error Codes
"""
def OTD01(self):
return "The 'ID' columns is a mandatory field in the Ontobiotope Sheet."
def OTD02(self):
return "The 'ID' columns is empty or has missing values."
def OTD03(self):
return "The 'Name' columns is a mandatory field in the Ontobiotope Sheet. The column can not be empty."
def OTD04(self):
return f"The 'Name' for ontobiotope with ID {self.pk} is missing."

View File

@ -1,5 +0,0 @@
openpyxl
requests
requests_oauthlib
pycountry
deepdiff

View File

@ -3,6 +3,7 @@ from pathlib import Path
DATA_DIR = Path(__file__).parent / "data"
ACCESSION_NUMBER = "accession_number"
MIRRI_ACCESSION_NUMBER = 'mirri_accession_number'
RESTRICTION_ON_USE = "restriction_on_use"
NAGOYA_PROTOCOL = "nagoya_protocol"
ABS_RELATED_FILES = "abs_related_files"
@ -14,6 +15,7 @@ DUAL_USE = "dual_use"
QUARANTINE = "quarantine"
ORGANISM_TYPE = "organism_type"
TAXON_NAME = "taxon_name"
TYPE = "type"
INFRASUBSPECIFIC_NAME = "infrasubspecific_names"
COMMENTS_ON_TAXONOMY = "comments_on_taxonomy"
STATUS = "status"
@ -54,6 +56,9 @@ SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation"
ISOLATION_HABITAT = "isolation_habitat"
ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat"
LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome"
AXENIC_CULTURE = "axenic_culture"
QPS ="qps"
SITE_LINK = "site_links"
# StrainId
STRAIN_ID = "id"
@ -99,73 +104,80 @@ ALLOWED_COLLECTING_SITE_KEYS = [
]
MIRRI_FIELDS = [
{"attribute": "id", "label": "Accession number"},
{"attribute": "restriction_on_use", "label": "Restrictions on use"},
{"attribute": "id", "label": "accessionNumber"},
{"attribute": "mirri_accession_number", "label": "mirriAccessionNumber"},
{"attribute": "qps", "label": "qps"},
{"attribute": "axenic_culture", "label": "axenicCulture"},
{"attribute": "restriction_on_use", "label": "useRestrictions"},
{"attribute": "nagoya_protocol",
"label": "Nagoya protocol restrictions and compliance conditions"},
{"attribute": ABS_RELATED_FILES, "label": "ABS related files"},
{"attribute": "mta_files", "label": "MTA file"},
{"attribute": "other_numbers", "label": "Other culture collection numbers"},
"label": "nagoyaConditions"},
{"attribute": ABS_RELATED_FILES, "label": "absFile"},
{"attribute": "mta_files", "label": "mtaFile"},
{"attribute": "other_numbers", "label": "otherCollectionNumbers"},
{"attribute": "is_from_registered_collection",
"label": "Strain from a registered collection"},
{"attribute": "risk_group", "label": "Risk Group"},
{"attribute": "is_potentially_harmful", "label": "Dual use"},
{"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"},
{"attribute": "taxonomy.organism_type", "label": "Organism type"},
{"attribute": "taxonomy.taxon_name", "label": "Taxon name"},
"label": "registeredCollection"},
{"attribute": "risk_group", "label": "riskGroup"},
{"attribute": "is_potentially_harmful", "label": "dualUse"},
{"attribute": "is_subject_to_quarantine", "label": "euQuarantine"},
{"attribute": "taxonomy.organism_type", "label": "organismType"},
{"attribute": "taxonomy.taxon_name", "label": "speciesName"},
{"attribute": "taxonomy.infrasubspecific_name",
"label": "Infrasubspecific names"},
{"attribute": "taxonomy.comments", "label": "Comment on taxonomy"},
"label": "infrasubspecificNames"},
{"attribute": "taxonomy.comments", "label": "taxonomyComments"},
{"attribute": "taxonomy.interspecific_hybrid",
"label": "Interspecific hybrid"},
{"attribute": "status", "label": "Status"},
{"attribute": "history", "label": "History of deposit", },
{"attribute": "deposit.who", "label": "Depositor"},
{"attribute": "deposit.date", "label": "Date of deposit"},
"label": "hybrid"},
{"attribute": "status", "label": "status"},
{"attribute": "history", "label": "depositHistory", },
{"attribute": "deposit.who", "label": "depositor"},
{"attribute": "deposit.date", "label": "depositDate"},
{"attribute": "catalog_inclusion_date",
"label": "Date of inclusion in the catalogue"},
{"attribute": "collect.who", "label": "Collected by"},
{"attribute": "collect.date", "label": "Date of collection"},
{"attribute": "isolation.who", "label": "Isolated by"},
{"attribute": "isolation.date", "label": "Date of isolation"},
"label": "accessionDate"},
{"attribute": "collect.who", "label": "collector"},
{"attribute": "collect.date", "label": "collectionDate"},
{"attribute": "isolation.who", "label": "isolator"},
{"attribute": "isolation.date", "label": "isolationDate"},
{"attribute": "isolation.substrate_host_of_isolation",
"label": "Substrate/host of isolation"},
"label": "substrate"},
{"attribute": "growth.tested_temp_range",
"label": "Tested temperature growth range"},
"label": "temperatureGrowthRange"},
{"attribute": "growth.recommended_temp",
"label": "Recommended growth temperature"},
"label": "recommendedTemperature"},
{"attribute": "growth.recommended_media",
"label": "Recommended medium for growth"},
{"attribute": "form_of_supply", "label": "Form of supply"},
{"attribute": "other_denominations", "label": "Other denomination"},
"label": "recommendedMedium"},
{"attribute": "form_of_supply", "label": "supplyForms"},
{"attribute": "other_denominations", "label": "otherDenomination"},
{"attribute": "collect.location.coords",
"label": "Coordinates of geographic origin"},
"label": "geographicCoordinates"},
{"attribute": "collect.site.links",
"label": "siteLinks"},
{"attribute": "collect.location.altitude",
"label": "Altitude of geographic origin"},
{"attribute": "collect.location", "label": "Geographic origin"},
{"attribute": "collect.habitat", "label": "Isolation habitat"},
"label": "country"},
{"attribute": "collect.location", "label": "geographicOrigin"},
{"attribute": "collect.habitat", "label": "isolationHabitat"},
{"attribute": "collect.habitat_ontobiotope",
"label": "Ontobiotope term for the isolation habitat"},
{"attribute": "genetics.gmo", "label": "GMO"},
"label": "ontobiotopeTerms"},
{"attribute": "genetics.gmo", "label": "gmo"},
{"attribute": "genetics.gmo_construction",
"label": "GMO construction information"},
{"attribute": "genetics.mutant_info", "label": "Mutant information"},
{"attribute": "genetics.genotype", "label": "Genotype"},
{"attribute": "genetics.sexual_state", "label": "Sexual state"},
{"attribute": "genetics.ploidy", "label": "Ploidy"},
{"attribute": "genetics.plasmids", "label": "Plasmids"},
"label": "gmoConstruction"},
{"attribute": "genetics.mutant_info", "label": "mutant"},
{"attribute": "genetics.genotype", "label": "genotype"},
{"attribute": "genetics.sexual_state", "label": "sexualState"},
{"attribute": "genetics.ploidy", "label": "ploidy"},
{"attribute": "genetics.plasmids", "label": "plasmids"},
{"attribute": "genetics.plasmids_in_collections",
"label": "Plasmids collections fields"},
{"attribute": "publications", "label": "Literature"},
"label": "plasmidCollections"},
{"attribute": "publications", "label": "identificationLiterature"},
{"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"},
{"attribute": "pathogenicity", "label": "Pathogenicity"},
{"attribute": "enzyme_production", "label": "Enzyme production"},
{"attribute": "pathogenicity", "label": "pathogenicity"},
{"attribute": "enzyme_production", "label": "enzymes"},
{"attribute": "production_of_metabolites",
"label": "Production of metabolites"},
{"attribute": "applications", "label": "Applications", },
{"attribute": "remarks", "label": "Remarks"},
"label": "metabolites"},
{"attribute": "type",
"label": "type"},
{"attribute": "applications", "label": "applications", },
{"attribute": "remarks", "label": "remarks"},
{"attribute": LITERATURE_LINKED_TO_SEQ_GENOME,
"label": "Literature linked to the sequence/genome"},
"label": "sequenceLiterature"},
]
ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma",
@ -228,8 +240,9 @@ ALLOWED_MARKER_TYPES = [
]
PUBLICATIONS = "publications"
PUB_ID = "id"
PUB_ID = "pub_id"
PUB_DOI = "pub_doi"
PUB_PMID = "pub_pmid"
PUB_PUBMED_ID = ''
PUB_FULL_REFERENCE = "full_reference"
PUB_TITLE = "title"
@ -247,6 +260,8 @@ BOOK_PUBLISHER = "book_publisher"
PUBLICATION_FIELDS = [
{"label": "ID", "attribute": PUB_ID},
{"label": "PMID", "attribute": PUB_PMID},
{"label": "DOI", "attribute": PUB_DOI},
{"label": "Full reference", "attribute": PUB_FULL_REFERENCE},
{"label": "Authors", "attribute": PUB_AUTHORS},
{"label": "Title", "attribute": PUB_TITLE},
@ -282,15 +297,43 @@ SUBTAXAS = {
"f.sp.": "forma.specialis"
}
#Control
VERSION = "Version"
DATE = "Date"
#Country codes
COUNTRY = "Country"
CODE = "Code"
ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE = "Additional information on the country or code"
#Country codes files
COUNTRY_CODES_SHEET = [
{"label": "Country", "attribute": COUNTRY},
{"label": "Code", "attribute": CODE},
{"label": "Additional information on the country or code", "attribute": ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE},
]
#Control files
CONTROL_FIELDS = [
{"label": "Version", "attribute": VERSION},
{"label": "Date", "attribute": DATE},
]
# Excel sheet name
LOCATIONS = "Geographic origin" # 'Locations'
GROWTH_MEDIA = "Growth media"
GENOMIC_INFO = "Genomic information"
STRAINS = "Strains"
LITERATURE_SHEET = "Literature"
SEXUAL_STATE_SHEET = "Sexual states"
SEXUAL_STATE_SHEET = "Sexual state"
RESOURCE_TYPES_VALUES = "Resource types values"
FORM_OF_SUPPLY_SHEET = "Forms of supply"
PLOIDY_SHEET = "Ploidy"
ONTOBIOTOPE = "Ontobiotope"
MARKERS = "Markers"
CONTROL_SHEET = "Version"
COUNTRY_CODES_SHEET = "Country codes"
RESOURCE_SHEET = 'Resource types values'
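A small sketch of how the MIRRI_FIELDS list above can be turned into an attribute-to-label lookup for the new camelCase column names (the variable name label_by_attribute is illustrative):
label_by_attribute = {field["attribute"]: field["label"] for field in MIRRI_FIELDS}
label_by_attribute["risk_group"]  # 'riskGroup'
label_by_attribute["growth.recommended_temp"]  # 'recommendedTemperature'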

View File

@ -1,35 +0,0 @@
import setuptools
from pathlib import Path
from setuptools import find_packages
with open("README.md", "r") as fh:
long_description = fh.read()
requirements = [line.strip() for line in open('requirements.txt')]
scripts = [str(f) for f in Path('./bin').glob('*.py')]
setuptools.setup(
name="Mirri utils", # Replace with your own username
version=0.1,
author="P.Ziarsolo",
author_email="pziarsolo@gmail.com",
description="A small library to help dealing with MIRRI data",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/pziarsolo/mirri_utils",
packages=find_packages(),
package_data={"mirri": ['data/ontobiotopes.csv']},
# package_dir={"mirri.entities": "mirri.entities"
# "mirri.io.parsers": "mirri.io.parsers",
# "mirri.io.writers": "mirri.io.writers",
# 'mirri.validation': 'mirri.vallidation'},
install_requires=requirements,
scripts=scripts,
license="GNU General Public License v3.0",
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
)

View File

View File

@ -1,22 +0,0 @@
import unittest
from mirri.biolomics.remote.rest_client import BiolomicsClient
try:
from mirri.biolomics.secrets import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
except ImportError:
raise ImportError(
'You need a secrets.py in the project directory with CLIENT_ID, SECRET_ID, USERNAME, PASSWORD')
from .utils import VERSION, SERVER_URL
class BiolomicsClientAuthTest(unittest.TestCase):
def test_authentication(self):
client = BiolomicsClient(SERVER_URL, VERSION, CLIENT_ID, SECRET_ID,
USERNAME, PASSWORD)
access1 = client.get_access_token()
access2 = client.get_access_token()
assert access1 is not None
self.assertEqual(access1, access2)

View File

@ -1,62 +0,0 @@
import unittest
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS
from mirri.biolomics.serializers.growth_media import GrowthMedium
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from tests.biolomics.utils import SERVER_URL, VERSION
class BiolomicsSequenceClientTest(unittest.TestCase):
def setUp(self):
self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
SECRET_ID, USERNAME, PASSWORD)
def test_retrieve_media_by_id(self):
record_id = 101
growth_medium = self.client.retrieve_by_id('growth_medium', record_id)
self.assertEqual(growth_medium.record_id, record_id)
self.assertEqual(growth_medium.record_name, 'MA2PH6')
def test_retrieve_media_by_name(self):
record_name = 'MA2PH6'
record_id = 101
growth_medium = self.client.retrieve_by_name('growth_medium', record_name)
self.assertEqual(growth_medium.record_id, record_id)
self.assertEqual(growth_medium.record_name, record_name)
def test_create_growth_media(self):
self.client.start_transaction()
try:
growth_medium = GrowthMedium()
growth_medium.acronym = 'BBB'
growth_medium.ingredients = 'alkhdflakhf'
growth_medium.description = 'desc'
new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium)
print(new_growth_medium.dict())
finally:
self.client.rollback()
def test_update_growth_media(self):
self.client.start_transaction()
try:
growth_medium = GrowthMedium()
growth_medium.acronym = 'BBB'
growth_medium.ingredients = 'alkhdflakhf'
growth_medium.description = 'desc'
growth_medium.full_description = 'full'
new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium)
new_growth_medium.full_description = 'full2'
updated_gm = new_growth_medium = self.client.update(GROWTH_MEDIUM_WS, new_growth_medium)
self.assertEqual(updated_gm.full_description, new_growth_medium.full_description)
retrieved = self.client.retrieve_by_id(GROWTH_MEDIUM_WS, new_growth_medium.record_id)
self.assertEqual(retrieved.full_description, updated_gm.full_description)
finally:
self.client.rollback()

View File

@ -1,46 +0,0 @@
import unittest
from .utils import VERSION, SERVER_URL
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient, BIBLIOGRAPHY_WS
from mirri.entities.publication import Publication
class BiolomicsLiteratureClientTest(unittest.TestCase):
def setUp(self):
self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
SECRET_ID, USERNAME, PASSWORD)
def test_retrieve_biblio_by_id(self):
record_id = 100
record_name = "Miscellaneous notes on Mucoraceae"
biblio = self.client.retrieve_by_id(BIBLIOGRAPHY_WS, record_id)
self.assertEqual(biblio.record_id, record_id)
self.assertEqual(biblio.record_name, record_name)
def test_retrieve_biblio_by_name(self):
record_id = 100
record_name = "Miscellaneous notes on Mucoraceae"
biblio = self.client.retrieve_by_name(BIBLIOGRAPHY_WS, record_name)
self.assertEqual(biblio.record_id, record_id)
self.assertEqual(biblio.record_name, record_name)
self.assertEqual(biblio.year, 1994)
self.assertEqual(biblio.volume, '50')
def test_create_biblio(self):
pub = Publication()
pub.pubmed_id = 'PM18192'
pub.journal = 'my_journal'
pub.title = 'awesome title'
pub.authors = 'pasdas, aposjdasd, alsalsfda'
pub.volume = 'volume 0'
record_id = None
try:
new_pub = self.client.create(BIBLIOGRAPHY_WS, pub)
record_id = new_pub.record_id
self.assertEqual(new_pub.title, pub.title)
self.assertEqual(new_pub.volume, pub.volume)
finally:
if record_id is not None:
self.client.delete_by_id(BIBLIOGRAPHY_WS, record_id)

View File

@ -1,49 +0,0 @@
import unittest
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.serializers.sequence import GenomicSequenceBiolomics
from .utils import VERSION, SERVER_URL
class BiolomicsSequenceClientTest(unittest.TestCase):
def setUp(self) -> None:
self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
SECRET_ID, USERNAME, PASSWORD)
def test_retrieve_seq_by_id(self):
record_id = 101
sequence = self.client.retrieve_by_id('sequence', record_id)
self.assertEqual(sequence.record_id, record_id)
self.assertEqual(sequence.record_name, 'MUM 02.54 - CaM')
self.assertEqual(sequence.marker_type, 'CaM')
def test_retrieve_seq_by_name(self):
record_name = 'MUM 02.54 - CaM'
sequence = self.client.retrieve_by_name('sequence', record_name)
self.assertEqual(sequence.record_id, 101)
self.assertEqual(sequence.record_name, record_name)
self.assertEqual(sequence.marker_type, 'CaM')
def test_create_delete_sequence(self):
marker = GenomicSequenceBiolomics()
marker.marker_id = 'GGAAUUA'
marker.marker_seq = 'aattgacgat'
marker.marker_type = 'CaM'
marker.record_name = 'peioMarker'
new_marker = self.client.create('sequence', marker)
self.assertEqual(new_marker.marker_id, 'GGAAUUA')
self.assertEqual(new_marker.marker_seq, 'aattgacgat')
self.assertEqual(new_marker.marker_type, 'CaM')
self.assertEqual(new_marker.record_name, 'peioMarker')
self.assertTrue(new_marker.record_id)
self.client.delete_by_id('sequence', new_marker.record_id)
if __name__ == "__main__":
# import sys;sys.argv = ['', 'BiolomicsClient.Test.test_get_strain_by_id']
unittest.main()

View File

@ -1,727 +0,0 @@
import unittest
import pycountry
import deepdiff
from pprint import pprint
from mirri.biolomics.serializers.sequence import (
GenomicSequenceBiolomics,
serialize_to_biolomics as sequence_to_biolomics,
serialize_from_biolomics as sequence_from_biolomics)
from mirri.biolomics.serializers.strain import (
serialize_to_biolomics as strain_to_biolomics,
serialize_from_biolomics as strain_from_biolomics)
from mirri.biolomics.serializers.growth_media import (
# serialize_to_biolomics as growth_medium_to_biolomics,
serialize_from_biolomics as growth_medium_from_biolomics)
from mirri.biolomics.serializers.bibliography import (
serializer_from_biolomics as literature_from_biolomics,
serializer_to_biolomics as literature_to_biolomics
)
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.entities.publication import Publication
from .utils import create_full_data_strain, VERSION, SERVER_URL
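# Fixture: a strain record in the layout returned by the Biolomics web service,
# deserialized by strain_from_biolomics in StrainSerializerTest below.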
STRAIN_WS = {
'CreationDate': '2021-05-19T12:22:33',
'CreatorUserName': 'pziarsolo@cect.org',
'LastChangeDate': '2021-05-19T12:22:36',
'LastChangeUserName': 'pziarsolo@cect.org',
'RecordDetails': {'ABS related files': {'FieldType': 21,
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'Altitude of geographic origin': {'FieldType': 4,
'Value': 121.0},
'Applications': {'FieldType': 5, 'Value': 'health'},
'Catalog URL': {'FieldType': 21, 'Value': []},
'Collection accession number': {'FieldType': 5,
'Value': 'TESTCC 1'},
'Collection date': {'FieldType': 8, 'Value': '1991/01/01'},
'Collector': {'FieldType': 5, 'Value': 'the collector'},
'Comment on taxonomy': {'FieldType': 5,
'Value': 'lalalalla'},
'Coordinates of geographic origin': {'FieldType': 12,
'Value': {'Altitude': 0.0,
'Latitude': 23.3,
'Longitude': 23.3,
'Precision': 0.0}},
'Country': {'FieldType': 118,
'Value': [{'Name': {'FieldType': 5,
'Value': 'Spain'},
'RecordId': 54,
'TargetFieldValue': None}]},
'Data provided by': {'FieldType': 22, 'Value': 'Unknown'},
'Date of inclusion in the catalogue': {'FieldType': 8,
'Value': '1985/05/02'},
'Deposit date': {'FieldType': 8, 'Value': '1985/05/02'},
'Depositor': {'FieldType': 5,
'Value': 'NCTC, National Collection of Type '
'Cultures - NCTC, London, United '
'Kingdom of Great Britain and '
'Northern Ireland.'},
'Dual use': {'FieldType': 20, 'Value': 'yes'},
'Enzyme production': {'FieldType': 5,
'Value': 'some enzimes'},
'Form': {'FieldType': 3,
'Value': [{'Name': 'Agar', 'Value': 'yes'},
{'Name': 'Cryo', 'Value': 'no'},
{'Name': 'Dry Ice', 'Value': 'no'},
{'Name': 'Liquid Culture Medium',
'Value': 'no'},
{'Name': 'Lyo', 'Value': 'yes'},
{'Name': 'Oil', 'Value': 'no'},
{'Name': 'Water', 'Value': 'no'}]},
'GMO': {'FieldType': 22, 'Value': 'Yes'},
'GMO construction information': {'FieldType': 5,
'Value': 'instructrion to '
'build'},
'Genotype': {'FieldType': 5, 'Value': 'some genotupe'},
'Geographic origin': {'FieldType': 5,
'Value': 'una state; one '
'municipality; somewhere in '
'the world'},
'History': {'FieldType': 5,
'Value': 'newer < In the middle < older'},
'Infrasubspecific names': {'FieldType': 5,
'Value': 'serovar tete'},
'Interspecific hybrid': {'FieldType': 20, 'Value': 'no'},
'Isolation date': {'FieldType': 8, 'Value': '1900/01/01'},
'Isolation habitat': {'FieldType': 5,
'Value': 'some habitat'},
'Isolator': {'FieldType': 5, 'Value': 'the isolator'},
'Literature': {'FieldType': 118, 'Value': []},
'MTA files URL': {'FieldType': 21,
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'MTA text': {'FieldType': 5, 'Value': ''},
'Metabolites production': {'FieldType': 5,
'Value': 'big factory of cheese'},
'Mutant information': {'FieldType': 5, 'Value': 'x-men'},
'Nagoya protocol restrictions and compliance conditions': {'FieldType': 20,
'Value': 'no '
'known '
'restrictions '
'under '
'the '
'Nagoya '
'protocol'},
'Ontobiotope': {'FieldType': 118,
'Value': [{'Name': {'FieldType': 5,
'Value': 'anaerobic '
'bioreactor '
'(OBT:000190)'},
'RecordId': 100,
'TargetFieldValue': None}]},
'Ontobiotope term for the isolation habitat': {'FieldType': 5,
'Value': ''},
'Orders': {'FieldType': 118, 'Value': []},
'Organism type': {'FieldType': 3,
'Value': [{'Name': 'Algae', 'Value': 'no'},
{'Name': 'Archaea',
'Value': 'yes'},
{'Name': 'Bacteria',
'Value': 'no'},
{'Name': 'Cyanobacteria',
'Value': 'no'},
{'Name': 'Filamentous Fungi',
'Value': 'no'},
{'Name': 'Phage', 'Value': 'no'},
{'Name': 'Plasmid',
'Value': 'no'},
{'Name': 'Virus', 'Value': 'no'},
{'Name': 'Yeast', 'Value': 'no'},
{'Name': 'Microalgae',
'Value': '?'}]},
'Other culture collection numbers': {'FieldType': 5,
'Value': 'aaa a; aaa3 '
'a3'},
'Other denomination': {'FieldType': 5, 'Value': ''},
'Pathogenicity': {'FieldType': 5, 'Value': 'illness'},
'Plasmids': {'FieldType': 5, 'Value': 'asda'},
'Plasmids collections fields': {'FieldType': 5,
'Value': 'asdasda'},
'Ploidy': {'FieldType': 20, 'Value': 'Polyploid'},
'Quarantine in Europe': {'FieldType': 20, 'Value': 'no'},
'Recommended growth medium': {'FieldType': 118,
'Value': [{'Name': {'FieldType': 5,
'Value': 'AAA'},
'RecordId': 1,
'TargetFieldValue': None}]},
'Recommended growth temperature': {'FieldType': 19,
'MaxValue': 30.0,
'MinValue': 30.0},
'Remarks': {'FieldType': 5, 'Value': 'no remarks for me'},
'Restrictions on use': {'FieldType': 20,
'Value': 'no restriction apply'},
'Risk group': {'FieldType': 20, 'Value': '1'},
'Sequences 16s': {"Value": [
{
"Name": {
"Value": "X76436",
"FieldType": 5
},
"RecordId": 50992,
"TargetFieldValue": {
"Value": {
"Sequence": ""
},
"FieldType": 14
}
}
],
"FieldType": 114},
'Sequences 18S rRNA': {'FieldType': 114, 'Value': []},
'Sequences 23S rRNA': {'FieldType': 114, 'Value': []},
'Sequences ACT': {'FieldType': 114, 'Value': []},
'Sequences AmdS': {'FieldType': 114, 'Value': []},
'Sequences Amds12': {'FieldType': 114, 'Value': []},
'Sequences Beta tubulin': {'FieldType': 114, 'Value': []},
'Sequences COX1': {'FieldType': 114, 'Value': []},
'Sequences COX2': {'FieldType': 114, 'Value': []},
'Sequences CaM': {'FieldType': 114, 'Value': []},
'Sequences Cct8': {'FieldType': 114, 'Value': []},
'Sequences Cit1': {'FieldType': 114, 'Value': []},
'Sequences CypA': {'FieldType': 114, 'Value': []},
'Sequences GDP': {'FieldType': 114, 'Value': []},
'Sequences GPD': {'FieldType': 114, 'Value': []},
'Sequences Genome': {'FieldType': 114, 'Value': []},
'Sequences HIS': {'FieldType': 114, 'Value': []},
'Sequences HSP': {'FieldType': 114, 'Value': []},
'Sequences IDH': {'FieldType': 114, 'Value': []},
'Sequences IGS': {'FieldType': 114, 'Value': []},
'Sequences ITS': {'FieldType': 114, 'Value': []},
'Sequences LSU': {'FieldType': 114, 'Value': []},
'Sequences MAT': {'FieldType': 114, 'Value': []},
'Sequences MAT1': {'FieldType': 114, 'Value': []},
'Sequences Miscellaneous': {'FieldType': 114, 'Value': []},
'Sequences NorA': {'FieldType': 114, 'Value': []},
'Sequences NorB': {'FieldType': 114, 'Value': []},
'Sequences Omt12': {'FieldType': 114, 'Value': []},
'Sequences OmtA': {'FieldType': 114, 'Value': []},
'Sequences PcCYP': {'FieldType': 114, 'Value': []},
'Sequences PpgA': {'FieldType': 114, 'Value': []},
'Sequences PreA': {'FieldType': 114, 'Value': []},
'Sequences PreB': {'FieldType': 114, 'Value': []},
'Sequences RAPD': {'FieldType': 114, 'Value': []},
'Sequences RPB1': {'FieldType': 114, 'Value': []},
'Sequences RPB2': {'FieldType': 114, 'Value': []},
'Sequences SSU': {'FieldType': 114, 'Value': []},
'Sequences TEF1a': {'FieldType': 114, 'Value': []},
'Sequences TEF2': {'FieldType': 114, 'Value': []},
'Sequences TUB': {'FieldType': 114, 'Value': []},
'Sequences Tsr1': {'FieldType': 114, 'Value': []},
'Sequences c16S rRNA': {'FieldType': 114, 'Value': []},
'Sequences cbhI': {'FieldType': 114, 'Value': []},
'Sequences mcm7': {'FieldType': 114, 'Value': []},
'Sequences rbcL': {'FieldType': 114, 'Value': []},
'Sexual state': {'FieldType': 5, 'Value': 'MT+A'},
'Status': {'FieldType': 5,
'Value': 'type of Bacillus alcalophilus'},
'Strain from a registered collection': {'FieldType': 20,
'Value': 'no'},
'Substrate of isolation': {'FieldType': 5,
'Value': 'some substrate'},
'Taxon name': {'FieldType': 109,
'Value': [{'Name': {'FieldType': 5,
'Value': 'Escherichia '
'coli'},
'RecordId': 100004123,
'TargetFieldValue': {'DesktopInfo': None,
'DesktopInfoHtml': '<b>Current '
'name: '
'</b><i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'FieldType': 27,
'NewSynFieldInfo': None,
'ObligateSynonymId': 0,
'OriginalSynFieldInfo': None,
'SynInfo': {'BasionymRecord': {'NameInfo': '',
'RecordId': 100004123,
'RecordName': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'SecondLevelRecords': None},
'CurrentNameRecord': {'NameInfo': '',
'RecordId': 100004123,
'RecordName': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'SecondLevelRecords': None},
'ObligateSynonymRecords': [],
'SelectedRecord': {
'NameInfo': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'RecordId': 100004123,
'RecordName': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'SecondLevelRecords': None},
'TaxonSynonymsRecords': []},
'SynonymId': 100004123}}]},
'Tested temperature growth range': {'FieldType': 19,
'MaxValue': 32.0,
'MinValue': 29.0},
'Type description': {'FieldType': 5, 'Value': ''}},
'RecordId': 148038,
'RecordName': 'MIRRI 2240561'}
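# Expected payload produced by strain_to_biolomics() for create_full_data_strain()
# when no client is passed (checked with assertDictEqual in StrainSerializerTest).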
STRAIN_WS_EXPECTED_NO_REMOTE = {
'Acronym': 'MIRRI',
'RecordDetails': {'ABS related files': {'FieldType': 'U',
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'Altitude of geographic origin': {'FieldType': 'D',
'Value': 121},
'Applications': {'FieldType': 'E', 'Value': 'health'},
'Collection accession number': {'FieldType': 'E',
'Value': 'TESTCC 1'},
'Collection date': {'FieldType': 'H', 'Value': '1991-01-01'},
'Collector': {'FieldType': 'E', 'Value': 'the collector'},
'Comment on taxonomy': {'FieldType': 'E',
'Value': 'lalalalla'},
'Coordinates of geographic origin': {'FieldType': 'L',
'Value': {'Latitude': 23.3,
'Longitude': 23.3}},
'Date of inclusion in the catalogue': {'FieldType': 'H',
'Value': '1985-05-02'},
'Deposit date': {'FieldType': 'H', 'Value': '1985-05-02'},
'Depositor': {'FieldType': 'E',
'Value': 'NCTC, National Collection of Type '
'Cultures - NCTC, London, United '
'Kingdom of Great Britain and '
'Northern Ireland.'},
'Dual use': {'FieldType': 'T', 'Value': 'yes'},
'Enzyme production': {'FieldType': 'E',
'Value': 'some enzimes'},
'Form': {'FieldType': 'C',
'Value': [{'Name': 'Agar', 'Value': 'yes'},
{'Name': 'Cryo', 'Value': 'no'},
{'Name': 'Dry Ice', 'Value': 'no'},
{'Name': 'Liquid Culture Medium',
'Value': 'no'},
{'Name': 'Lyo', 'Value': 'yes'},
{'Name': 'Oil', 'Value': 'no'},
{'Name': 'Water', 'Value': 'no'}]},
'GMO': {'FieldType': 'V', 'Value': 'Yes'},
'GMO construction information': {'FieldType': 'E',
'Value': 'instructrion to '
'build'},
'Genotype': {'FieldType': 'E', 'Value': 'some genotupe'},
'Geographic origin': {'FieldType': 'E',
'Value': 'una state; one '
'municipality; somewhere in '
'the world'},
'History': {'FieldType': 'E',
'Value': 'firstplave < seconn place < third '
'place'},
'Infrasubspecific names': {'FieldType': 'E',
'Value': 'serovar tete'},
'Interspecific hybrid': {'FieldType': 'T', 'Value': 'no'},
'Isolation date': {'FieldType': 'H', 'Value': '1900-01-01'},
'Isolation habitat': {'FieldType': 'E',
'Value': 'some habitat'},
'Isolator': {'FieldType': 'E', 'Value': 'the isolator'},
'MTA files URL': {'FieldType': 'U',
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'Metabolites production': {'FieldType': 'E',
'Value': 'big factory of cheese'},
'Mutant information': {'FieldType': 'E', 'Value': 'x-men'},
'Nagoya protocol restrictions and compliance conditions': {'FieldType': 'T',
'Value': 'no '
'known '
'restrictions '
'under '
'the '
'Nagoya '
'protocol'},
'Ontobiotope': {'FieldType': 'RLink', 'Value': 'OBT:000190'},
'Organism type': {'FieldType': 'C',
'Value': [{'Name': 'Algae', 'Value': 'no'},
{'Name': 'Archaea',
'Value': 'yes'},
{'Name': 'Bacteria',
'Value': 'no'},
{'Name': 'Cyanobacteria',
'Value': 'no'},
{'Name': 'Filamentous Fungi',
'Value': 'no'},
{'Name': 'Phage', 'Value': 'no'},
{'Name': 'Plasmid',
'Value': 'no'},
{'Name': 'Virus', 'Value': 'no'},
{'Name': 'Yeast',
'Value': 'no'}]},
'Other culture collection numbers': {'FieldType': 'E',
'Value': 'aaa a; aaa3 '
'a3'},
'Pathogenicity': {'FieldType': 'E', 'Value': 'illness'},
'Plasmids': {'FieldType': 'E', 'Value': 'asda'},
'Plasmids collections fields': {'FieldType': 'E',
'Value': 'asdasda'},
'Ploidy': {'FieldType': 'T', 'Value': 'Polyploid'},
'Quarantine in Europe': {'FieldType': 'T', 'Value': 'no'},
'Recommended growth temperature': {'FieldType': 'S',
'MaxValue': 30.0,
'MinValue': 30.0},
'Remarks': {'FieldType': 'E', 'Value': 'no remarks for me'},
'Restrictions on use': {'FieldType': 'T',
'Value': 'no restriction apply'},
'Risk group': {'FieldType': 'T', 'Value': '1'},
'Sexual state': {'FieldType': 'E', 'Value': 'MT+A'},
'Status': {'FieldType': 'E',
'Value': 'type of Bacillus alcalophilus'},
'Strain from a registered collection': {'FieldType': 'T',
'Value': 'no'},
'Substrate of isolation': {'FieldType': 'E',
'Value': 'some substrate'},
'Taxon name': {'FieldType': 'SynLink',
'Value': 'Escherichia coli'},
'Tested temperature growth range': {'FieldType': 'S',
'MaxValue': 32.0,
'MinValue': 29.0}}}
class StrainSerializerTest(unittest.TestCase):
def test_serialize_to_biolomics(self):
strain = create_full_data_strain()
ws_strain = strain_to_biolomics(strain, client=None)
self.assertDictEqual(ws_strain, STRAIN_WS_EXPECTED_NO_REMOTE)
def test_serialize_to_biolomics_remote(self):
client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
SECRET_ID, USERNAME, PASSWORD)
strain = create_full_data_strain()
marker = GenomicSequenceBiolomics()
marker.marker_id = "MUM 02.15 - Beta tubulin"
marker.marker_type = 'TUBB'
strain.genetics.markers = [marker]
ws_strain = strain_to_biolomics(strain, client=client)
self.assertEqual(strain.collect.habitat_ontobiotope,
ws_strain['RecordDetails']['Ontobiotope']['Value'][0]['Name']['Value'])
self.assertEqual(pycountry.countries.get(alpha_3=strain.collect.location.country).name,
ws_strain['RecordDetails']['Country']['Value'][0]['Name']['Value'])
self.assertEqual(strain.publications[0].title,
ws_strain['RecordDetails']['Literature']['Value'][0]['Name']['Value'])
self.assertEqual(strain.genetics.markers[0].marker_id,
ws_strain['RecordDetails']['Sequences TUB']['Value'][0]['Name']['Value'])
def test_serialize_from_biolomics(self):
ws_strain = STRAIN_WS
strain = strain_from_biolomics(ws_strain)
self.assertEqual(strain.record_id, 148038)
self.assertEqual(strain.record_name, 'MIRRI 2240561')
self.assertEqual(strain.taxonomy.long_name, 'Escherichia coli')
self.assertEqual(strain.growth.recommended_media, ['AAA'])
self.assertEqual(strain.collect.location.altitude, 121)
self.assertEqual(strain.collect.location.country, 'ESP')
self.assertEqual(strain.applications, 'health')
self.assertEqual(strain.id.strain_id, 'TESTCC 1')
self.assertEqual(strain.collect.date.strfdate, '19910101')
self.assertEqual(strain.taxonomy.comments, 'lalalalla')
self.assertEqual(strain.catalog_inclusion_date.strfdate, '19850502')
self.assertIn('NCTC, National Collection of Type ', strain.deposit.who)
self.assertTrue(strain.is_potentially_harmful)
self.assertEqual(strain.form_of_supply, ['Agar', 'Lyo'])
self.assertTrue(strain.genetics.gmo)
self.assertEqual(strain.genetics.gmo_construction, 'instructrion to build')
self.assertEqual(strain.genetics.genotype, 'some genotupe')
self.assertEqual(strain.history, ['newer', 'In the middle', 'older'])
self.assertEqual(strain.taxonomy.infrasubspecific_name, 'serovar tete')
self.assertEqual(strain.isolation.who, 'the isolator')
self.assertEqual(strain.isolation.date.strfdate, '19000101')
self.assertEqual(strain.mta_files, ['https://example.com'])
self.assertEqual(strain.genetics.mutant_info, 'x-men')
self.assertEqual(strain.collect.habitat_ontobiotope, 'OBT:000190')
self.assertEqual(strain.taxonomy.organism_type[0].name, 'Archaea')
self.assertEqual(strain.other_numbers[0].strain_id, 'aaa a')
self.assertEqual(strain.other_numbers[1].strain_id, 'aaa3 a3')
self.assertEqual(strain.pathogenicity, 'illness')
self.assertEqual(strain.genetics.plasmids, ['asda'])
self.assertEqual(strain.genetics.ploidy, 9)
self.assertFalse(strain.is_subject_to_quarantine)
self.assertEqual(strain.risk_group, '1')
self.assertFalse(strain.is_from_registered_collection)
self.assertEqual(strain.growth.tested_temp_range, {'min': 29, 'max': 32})
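# Fixture: a genomic sequence record in Biolomics layout, deserialized in
# SequenceSerializerTest below.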
BIOLOMICSSEQ = {
'RecordDetails': {
'Barcode level': {'FieldType': 20, 'Value': 'undefined'},
'DNA extract number': {'FieldType': 5, 'Value': ''},
'DNA sequence': {'FieldType': 14,
'Value': {'Sequence': 'caaaggaggccttctccctcttcgtaag'}},
'Editing state': {'FieldType': 20, 'Value': 'Auto import'},
'Forward primer(s)': {'FieldType': 5, 'Value': ''},
'Genbank': {'FieldType': 21, 'Value': []},
'INSDC number': {'FieldType': 5, 'Value': 'AATGAT'},
'Literature': {'FieldType': 21, 'Value': []},
'Literature1': {'FieldType': 118, 'Value': []},
'Marker name': {'FieldType': 5, 'Value': 'CaM'},
'Privacy': {'FieldType': 20, 'Value': 'undefined'},
'Quality': {'FieldType': 5, 'Value': ''},
'Remarks': {'FieldType': 5, 'Value': ''},
'Reverse primer(s)': {'FieldType': 5, 'Value': ''},
'Review state': {'FieldType': 5, 'Value': ''},
'Strain number': {'FieldType': 5, 'Value': 'MUM 02.54'}},
'RecordId': 101,
'RecordName': 'MUM 02.54 - CaM'}
class SequenceSerializerTest(unittest.TestCase):
def test_from_biolomics(self):
marker = sequence_from_biolomics(BIOLOMICSSEQ)
self.assertEqual(marker.record_name, BIOLOMICSSEQ['RecordName'])
self.assertEqual(marker.record_id, BIOLOMICSSEQ['RecordId'])
self.assertEqual(marker.marker_type, BIOLOMICSSEQ['RecordDetails']['Marker name']['Value'])
self.assertEqual(marker.marker_id, BIOLOMICSSEQ['RecordDetails']['INSDC number']['Value'])
self.assertEqual(marker.marker_seq, BIOLOMICSSEQ['RecordDetails']['DNA sequence']['Value']['Sequence'])
def test_to_biolomics(self):
marker = GenomicSequenceBiolomics()
marker.marker_id = 'GGAAUUA'
marker.marker_seq = 'aattgacgat'
marker.marker_type = 'CaM'
marker.record_name = 'peioMarker'
marker.record_id = 111
ws_seq = sequence_to_biolomics(marker)
expected = {'RecordId': marker.record_id,
'RecordName': marker.record_name,
'RecordDetails': {
'INSDC number': {'Value': marker.marker_id, 'FieldType': 'E'},
'DNA sequence': {'Value': {'Sequence': marker.marker_seq}, 'FieldType': 'N'},
'Marker name': {'Value': marker.marker_type, 'FieldType': 'E'}}}
self.assertEqual(ws_seq, expected)
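# Fixture: a growth medium record in Biolomics layout, deserialized in
# MediumSerializerTest below.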
BIOLOMICS_MEDIUM = {
"RecordId": 100,
"RecordName": "MA20S",
"RecordDetails": {
"Full description": {
"Value": "mout agar+20% saccharose",
"FieldType": 5
},
"Ingredients": {
"Value": "Malt extract\r\n\tDilute brewery malt with water to 10% sugar solution (level 10 on Brix saccharose meter), 15 minutes at 121 C\r\nsaccharose\t200g\r\ndistilled water\t0.6l\r\nagar\t15g\r\n",
"FieldType": 5
},
"Link to full description": {
"Value": [],
"FieldType": 21
},
"Medium description": {
"Value": "",
"FieldType": 5
},
"Other name": {
"Value": "",
"FieldType": 5
},
"pH": {
"Value": "7 with KOH",
"FieldType": 5
},
"Remarks": {
"Value": "",
"FieldType": 5
},
"Reference": {
"Value": "",
"FieldType": 5
},
"Sterilization conditions": {
"Value": "15 minutes at 121 C",
"FieldType": 5
}
}
}
class MediumSerializerTest(unittest.TestCase):
def test_from_biolomics(self):
medium = growth_medium_from_biolomics(BIOLOMICS_MEDIUM)
self.assertEqual(medium.record_id, BIOLOMICS_MEDIUM['RecordId'])
self.assertEqual(medium.record_name, BIOLOMICS_MEDIUM['RecordName'])
self.assertEqual(medium.ingredients, BIOLOMICS_MEDIUM['RecordDetails']['Ingredients']['Value'])
self.assertEqual(medium.full_description, BIOLOMICS_MEDIUM['RecordDetails']['Full description']['Value'])
self.assertEqual(medium.ph, BIOLOMICS_MEDIUM['RecordDetails']['pH']['Value'])
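# Fixture: a bibliography record in Biolomics layout, deserialized in
# BibliographySerializerTest below.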
BIOLOMICS_BIBLIOGRAPHY = {
"RecordId": 100,
"RecordName": "Miscellaneous notes on Mucoraceae",
"RecordDetails": {
"Associated strains": {
"Value": [],
"FieldType": 118
},
"Associated taxa": {
"Value": [],
"FieldType": 118
},
"Authors": {
"Value": "Schipper, M.A.A.; Samson, R.A.",
"FieldType": 5
},
"Associated sequences": {
"Value": [],
"FieldType": 118
},
"Abstract": {
"Value": "",
"FieldType": 5
},
"Collection": {
"Value": "",
"FieldType": 5
},
"DOI number": {
"Value": "",
"FieldType": 5
},
"Editor(s)": {
"Value": "",
"FieldType": 5
},
"Full reference": {
"Value": "",
"FieldType": 5
},
"Hyperlink": {
"Value": [],
"FieldType": 21
},
"ISBN": {
"Value": "",
"FieldType": 5
},
"ISSN": {
"Value": "",
"FieldType": 5
},
"Issue": {
"Value": "",
"FieldType": 5
},
"Journal": {
"Value": "Mycotaxon",
"FieldType": 5
},
"Journal-Book": {
"Value": "",
"FieldType": 5
},
"Keywords": {
"Value": "",
"FieldType": 5
},
"Page from": {
"Value": "475",
"FieldType": 5
},
"Page to": {
"Value": "491",
"FieldType": 5
},
"Publisher": {
"Value": "",
"FieldType": 5
},
"PubMed ID": {
"Value": "",
"FieldType": 5
},
"Volume": {
"Value": "50",
"FieldType": 5
},
"Year": {
"Value": 1994,
"FieldType": 4
}
}
}
class BibliographySerializerTest(unittest.TestCase):
def test_from_biolomics(self):
pub = literature_from_biolomics(BIOLOMICS_BIBLIOGRAPHY)
self.assertEqual(pub.record_name, "Miscellaneous notes on Mucoraceae")
self.assertEqual(pub.record_id, 100)
self.assertEqual(pub.year, 1994)
self.assertEqual(pub.authors, "Schipper, M.A.A.; Samson, R.A.")
def test_to_biolomics(self):
pub = Publication()
pub.title = 'My title'
pub.year = 1992
pub.authors = 'me and myself'
pub.pubmed_id = '1112222'
pub.issue = 'issue'
ws_data = literature_to_biolomics(pub)
expected = {
'RecordDetails': {
'Authors': {'FieldType': 'E', 'Value': 'me and myself'},
'PubMed ID': {'FieldType': 'E', 'Value': '1112222'},
'Issue': {'FieldType': 'E', 'Value': 'issue'},
'Year': {'FieldType': 'D', 'Value': 1992}},
'RecordName': 'My title'}
self.assertDictEqual(expected, ws_data)
def test_to_biolomics2(self):
pub = Publication()
pub.pubmed_id = '1112222'
ws_data = literature_to_biolomics(pub)
expected = {
'RecordDetails': {
'PubMed ID': {'FieldType': 'E', 'Value': '1112222'}},
'RecordName': f'PUBMED:{pub.pubmed_id}'}
self.assertDictEqual(expected, ws_data)
pub = Publication()
pub.doi = 'doi.er/111/12131'
ws_data = literature_to_biolomics(pub)
expected = {
'RecordDetails': {
'DOI number': {'FieldType': 'E', 'Value': pub.doi}},
'RecordName': f'DOI:{pub.doi}'}
self.assertDictEqual(expected, ws_data)
if __name__ == "__main__":
# import sys; sys.argv = ['', 'BibliographySerializerTest']
unittest.main()

View File

@ -1,156 +0,0 @@
import unittest
from mirri.biolomics.remote.endoint_names import STRAIN_WS
from .utils import VERSION, SERVER_URL, create_full_data_strain
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number
class BiolomicsStrainClientTest(unittest.TestCase):
def setUp(self):
self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
SECRET_ID, USERNAME, PASSWORD)
def test_retrieve_strain_by_id(self):
record_id = 14803
strain = self.client.retrieve_by_id(STRAIN_WS, record_id)
self.assertEqual(strain.record_id, record_id)
print(strain.record_name)
def test_retrieve_strain_by_name(self):
record_id = 14803
record_name = 'MIRRI0014803'
strain = self.client.retrieve_by_name(STRAIN_WS, record_name)
self.assertEqual(strain.record_name, record_name)
self.assertEqual(strain.record_id, record_id)
def test_search_strain(self):
accession_number = "BEA 0014B"
query = {"Query": [{"Index": 0,
"FieldName": "Collection accession number",
"Operation": "TextExactMatch",
"Value": accession_number}],
"Expression": "Q0",
"DisplayStart": 0,
"DisplayLength": 10}
search_response = self.client.search(STRAIN_WS, query)
self.assertEqual(search_response['total'], 1)
self.assertEqual(search_response['records'][0].id.strain_id,
accession_number)
def test_search_strain4(self):
accession_number = "TESTCC 1"
query = {"Query": [{"Index": 0,
"FieldName": "Collection accession number",
"Operation": "TextExactMatch",
"Value": accession_number}],
"Expression": "Q0",
"DisplayStart": 0,
"DisplayLength": 10}
search_response = self.client.search(STRAIN_WS, query)
for strain in search_response['records']:
print(strain)
self.client.delete_by_id(STRAIN_WS, strain.record_id)
def test_search_strain_no_found(self):
accession_number = "BEA 0014B_"
query = {"Query": [{"Index": 0,
"FieldName": "Collection accession number",
"Operation": "TextExactMatch",
"Value": accession_number}],
"Expression": "Q0",
"DisplayStart": 0,
"DisplayLength": 10}
search_response = self.client.search(STRAIN_WS, query)
self.assertEqual(search_response['total'], 0)
self.assertFalse(search_response['records'])
def test_create_strain(self):
strain = create_full_data_strain()
strain.taxonomy.interspecific_hybrid = None
record_id = None
try:
new_strain = self.client.create(STRAIN_WS, strain)
record_id = new_strain.record_id
self.assertIsNone(new_strain.taxonomy.interspecific_hybrid)
self.assertEqual(new_strain.growth.recommended_media, ['AAA'])
self.assertEqual(new_strain.id.strain_id, strain.id.strain_id)
finally:
if record_id is not None:
self.client.delete_by_id(STRAIN_WS, record_id)
def test_update_strain(self):
strain = create_full_data_strain()
record_id = None
try:
new_strain = self.client.create(STRAIN_WS, strain)
record_id = new_strain.record_id
self.assertEqual(new_strain.id.strain_id, strain.id.strain_id)
self.assertFalse(new_strain.taxonomy.interspecific_hybrid)
new_strain.id.number = '2'
new_strain.taxonomy.interspecific_hybrid = None
updated_strain = self.client.update(STRAIN_WS, new_strain)
self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id)
self.assertIsNone(updated_strain.taxonomy.interspecific_hybrid)
retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id)
self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id)
self.assertIsNone(retrieved_strain.taxonomy.interspecific_hybrid)
finally:
if record_id is not None:
print('deleting')
self.client.delete_by_id(STRAIN_WS, record_id)
def test_update_strain_pathogenicity(self):
strain = create_full_data_strain()
print(strain.pathogenicity)
record_id = None
try:
new_strain = self.client.create(STRAIN_WS, strain)
record_id = new_strain.record_id
self.assertEqual(new_strain.id.strain_id, strain.id.strain_id)
self.assertEqual(new_strain.pathogenicity, 'illness')
new_strain.pathogenicity = None
updated_strain = self.client.update(STRAIN_WS, new_strain)
self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id)
self.assertIsNone(updated_strain.pathogenicity)
retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id)
self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id)
self.assertIsNone(retrieved_strain.pathogenicity)
finally:
if record_id is not None:
self.client.delete_by_id(STRAIN_WS, record_id)
def test_search_by_accession_number(self):
accession_number = "BEA 0014B"
strain = retrieve_strain_by_accession_number(self.client, accession_number)
self.assertEqual(strain.id.strain_id, accession_number)
def test_search_by_accession_number_not_found(self):
accession_number = "BEA 0014B_"
strain = retrieve_strain_by_accession_number(self.client, accession_number)
self.assertFalse(strain)
class BiolomicsClientGrowthMediaTest(unittest.TestCase):
def setUp(self):
self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
SECRET_ID, USERNAME, PASSWORD)
def xtest_growth_media_by_name(self):
gm = self.client.retrieve('growth_media', 'AAA')
self.assertEqual(gm['Record Id'], 1)
if __name__ == "__main__":
# import sys;sys.argv = ['',
# 'BiolomicsWriter.test_mirri_excel_parser_invalid']
unittest.main()

View File

@ -1,99 +0,0 @@
from mirri.biolomics.serializers.strain import StrainMirri
from mirri.entities.strain import StrainId, OrganismType
from mirri.entities.sequence import GenomicSequence
from mirri.entities.date_range import DateRange
from mirri.entities.publication import Publication
from mirri.settings import NAGOYA_NO_RESTRICTIONS
VERSION = 'v2'
SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
def create_full_data_strain():
strain = StrainMirri()
strain.id.number = "1"
strain.id.collection = "TESTCC"
strain.id.url = "https://cect/2342"
strain.restriction_on_use = "no_restriction"
strain.nagoya_protocol = NAGOYA_NO_RESTRICTIONS
strain.abs_related_files = ['https://example.com']
strain.mta_files = ['https://example.com']
strain.other_numbers.append(StrainId(collection="aaa", number="a"))
strain.other_numbers.append(StrainId(collection="aaa3", number="a3"))
strain.is_from_registered_collection = False
strain.risk_group = '1'
strain.is_potentially_harmful = True
strain.is_subject_to_quarantine = False
strain.taxonomy.organism_type = [OrganismType(2)]
strain.taxonomy.genus = 'Escherichia'
strain.taxonomy.species = 'coli'
strain.taxonomy.interspecific_hybrid = False
strain.taxonomy.infrasubspecific_name = 'serovar tete'
strain.taxonomy.comments = 'lalalalla'
strain.status = "type of Bacillus alcalophilus"
strain.history = 'firstplave < seconn place < third place'
strain.deposit.who = "NCTC, National Collection of Type Cultures - NCTC, London, United Kingdom of Great Britain and Northern Ireland."
strain.deposit.date = DateRange(year=1985, month=5, day=2)
strain.catalog_inclusion_date = DateRange(year=1985, month=5, day=2)
strain.collect.location.country = "ESP"
strain.collect.location.state = "una state"
strain.collect.location.municipality = "one municipality"
strain.collect.location.longitude = 23.3
strain.collect.location.latitude = 23.3
strain.collect.location.altitude = 121
strain.collect.location.site = "somewhere in the world"
strain.collect.habitat_ontobiotope = "OBT:000190"
strain.collect.habitat = 'some habitat'
strain.collect.who = "the collector"
strain.collect.date = DateRange(year=1991)
strain.isolation.date = DateRange(year=1900)
strain.isolation.who = 'the isolator'
strain.isolation.substrate_host_of_isolation = 'some substrate'
# already existing media in test_mirri
strain.growth.recommended_temp = {'min': 30, 'max': 30}
strain.growth.recommended_media = ["AAA"]
strain.growth.tested_temp_range = {'min': 29, 'max': 32}
strain.form_of_supply = ["Agar", "Lyo"]
#strain.other_denominations = ["lajdflasjdldj"]
gen_seq = GenomicSequence()
gen_seq.marker_id = "pepe"
gen_seq.marker_type = "16S rRNA"
strain.genetics.markers.append(gen_seq)
strain.genetics.ploidy = 9
strain.genetics.genotype = 'some genotupe'
strain.genetics.gmo = True
strain.genetics.gmo_construction = 'instructrion to build'
strain.genetics.mutant_info = 'x-men'
strain.genetics.sexual_state = 'MT+A'
strain.genetics.plasmids = ['asda']
strain.genetics.plasmids_in_collections = ['asdasda']
pub = Publication()
pub.title = "The genus Amylomyces"
strain.publications = [pub]
strain.plant_pathogenicity_code = 'PATH:001'
strain.pathogenicity = 'illness'
strain.enzyme_production = 'some enzimes'
strain.production_of_metabolites = 'big factory of cheese'
strain.applications = 'health'
strain.remarks = 'no remarks for me'
return strain
if __name__ == '__main__':
strain = create_full_data_strain()
print(strain.collect.habitat_ontobiotope)

View File

@ -1,5 +0,0 @@
{
"key1": "value1",
"key2": "value2",
"key3": "value3"
}

Binary file not shown.

Binary file not shown.

View File

@ -1,318 +0,0 @@
"""
Created on Dec. 2, 2020
@author: peio
"""
import unittest
from mirri.entities.publication import Publication
from mirri.entities.date_range import DateRange
from mirri.entities.location import Location
from mirri.entities.sequence import GenomicSequence
from mirri.entities.strain import (
Collect,
Deposit,
Isolation,
ValidationError,
OrganismType,
Strain,
StrainId,
Taxonomy,
)
from mirri.settings import (
COLLECT,
COUNTRY,
DATE_OF_ISOLATION,
DEPOSIT,
DEPOSITOR,
GENETICS,
GROWTH,
ISOLATED_BY,
ISOLATION,
LOCATION,
MARKERS,
NAGOYA_DOCS_AVAILABLE,
NAGOYA_PROTOCOL,
ORGANISM_TYPE,
OTHER_CULTURE_NUMBERS,
PLOIDY,
RECOMMENDED_GROWTH_MEDIUM,
TAXONOMY,
DATE_OF_INCLUSION, NO_RESTRICTION
)
from mirri.validation.entity_validators import validate_strain
class TestDataRange(unittest.TestCase):
def test_data_range_init(self):
dr = DateRange()
self.assertFalse(dr)
self.assertEqual(dr.__str__(), "")
self.assertEqual(dr.range["start"], None)
self.assertEqual(dr.range["end"], None)
dr.strpdate("2012")
self.assertEqual(dr.strfdate, "2012----")
self.assertTrue(dr)
dr.strpdate("2012----")
self.assertEqual(dr.strfdate, "2012----")
dr.strpdate("201212--")
self.assertEqual(dr.strfdate, "201212--")
try:
dr.strpdate("201213--")
self.fail()
except ValueError:
pass
try:
dr = DateRange(year=2012, month=13)
self.fail()
except ValueError:
pass
dr = DateRange(year=2020)
self.assertEqual(dr.strfdate, "2020----")
dr2 = dr.strpdate("2012")
self.assertEqual(dr2.range["start"].year, 2012)
self.assertEqual(dr2.range["start"].month, 1)
self.assertEqual(dr2.range["start"].day, 1)
self.assertEqual(dr2.range["end"].year, 2012)
self.assertEqual(dr2.range["end"].month, 12)
self.assertEqual(dr2.range["end"].day, 31)
class TestCollect(unittest.TestCase):
def test_collect_basic(self):
collect = Collect()
self.assertEqual(collect.dict(), {})
collect.location.country = "ESP"
collect.date = DateRange().strpdate("2012----")
collect.who = "pepito"
self.assertEqual(
dict(collect.dict()),
{
"location": {"countryOfOriginCode": "ESP"},
"collected_by": "pepito",
"date_of_collection": "2012----",
},
)
self.assertEqual(collect.__str__(),
"Collected: Spain in 2012---- by pepito")
class TestOrganismType(unittest.TestCase):
def test_basic_usage(self):
org_type = OrganismType(2)
self.assertEqual(org_type.name, "Archaea")
self.assertEqual(org_type.code, 2)
try:
org_type.ko = 'a'
self.fail()
except TypeError:
pass
org_type = OrganismType("Archaea")
class TestTaxonomy(unittest.TestCase):
def test_taxonomy_basic(self):
taxonomy = Taxonomy()
self.assertEqual(taxonomy.dict(), {})
self.assertFalse(taxonomy)
def test_taxonomy_with_data(self):
taxonomy = Taxonomy()
taxonomy.genus = "Bacilus"
taxonomy.organism_type = [OrganismType("Archaea")]
taxonomy.species = "vulgaris"
self.assertEqual(taxonomy.long_name, "Bacilus vulgaris")
# print(taxonomy.dict())
class TestLocation(unittest.TestCase):
def test_empty_init(self):
loc = Location()
self.assertEqual(loc.dict(), {})
self.assertFalse(loc)
def test_add_data(self):
loc = Location()
loc.country = "esp"
self.assertEqual(loc.dict(), {COUNTRY: "esp"})
loc.state = None
self.assertEqual(loc.dict(), {COUNTRY: "esp"})
class TestStrain(unittest.TestCase):
def test_empty_strain(self):
strain = Strain()
self.assertEqual(strain.dict(), {})
def test_strain_add_data(self):
strain = Strain()
strain.id.number = "5433"
strain.id.collection = "CECT"
strain.id.url = "https://cect/2342"
try:
strain.nagoya_protocol = "asdas"
self.fail()
except ValidationError:
pass
strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE
strain.dict()[NAGOYA_PROTOCOL] = NAGOYA_DOCS_AVAILABLE
strain.collect.location.country = "ESP"
self.assertEqual(strain.dict()[COLLECT][LOCATION][COUNTRY], "ESP")
strain.genetics.ploidy = 9
self.assertEqual(strain.dict()[GENETICS][PLOIDY], 9)
strain.growth.recommended_media = ["asd"]
strain.isolation.date = DateRange(year=1900)
self.assertEqual(strain.dict()[ISOLATION]
[DATE_OF_ISOLATION], "1900----")
strain.deposit.who = "pepe"
self.assertEqual(strain.dict()[DEPOSIT][DEPOSITOR], "pepe")
strain.growth.recommended_media = ["11"]
self.assertEqual(strain.dict()[GROWTH]
[RECOMMENDED_GROWTH_MEDIUM], ["11"])
strain.taxonomy.organism_type = [OrganismType(2)]
self.assertEqual(
strain.dict()[TAXONOMY][ORGANISM_TYPE], [
{"code": 2, "name": "Archaea"}]
)
strain.taxonomy.organism_type = [OrganismType("Algae")]
self.assertEqual(
strain.dict()[TAXONOMY][ORGANISM_TYPE], [
{"code": 1, "name": "Algae"}]
)
strain.other_numbers.append(StrainId(collection="aaa", number="a"))
strain.other_numbers.append(StrainId(collection="aaa3", number="a3"))
self.assertEqual(
strain.dict()[OTHER_CULTURE_NUMBERS],
[
{"collection_code": "aaa", "accession_number": "a"},
{"collection_code": "aaa3", "accession_number": "a3"},
],
)
strain.form_of_supply = ["Agar", "Lyo"]
gen_seq = GenomicSequence()
self.assertEqual(gen_seq.dict(), {})
gen_seq.marker_id = "pepe"
gen_seq.marker_type = "16S rRNA"
strain.genetics.markers.append(gen_seq)
self.assertEqual(
strain.dict()[GENETICS][MARKERS],
[{"marker_type": "16S rRNA", "INSDC": "pepe"}],
)
strain.collect.habitat_ontobiotope = "OBT:111111"
self.assertEqual(strain.collect.habitat_ontobiotope, "OBT:111111")
try:
strain.collect.habitat_ontobiotope = "OBT:11111"
self.fail()
except ValidationError:
pass
# publications
try:
strain.publications = 1
self.fail()
except ValidationError:
pass
pub = Publication()
pub.id = "1"
try:
strain.publications = pub
self.fail()
except ValidationError:
pass
strain.publications = [pub]
self.assertEqual(strain.publications[0].id, "1")
strain.catalog_inclusion_date = DateRange(year=1992)
self.assertEqual(strain.dict()[DATE_OF_INCLUSION], '1992----')
import pprint
pprint.pprint(strain.dict())
def test_strain_validation(self):
strain = Strain()
strain.form_of_supply = ['Lyo']
return
errors = validate_strain(strain)
self.assertEqual(len(errors), 10)
strain.id.collection = 'test'
strain.id.number = '1'
errors = validate_strain(strain)
self.assertEqual(len(errors), 9)
strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE
strain.restriction_on_use = NO_RESTRICTION
strain.risk_group = 1
strain.taxonomy.organism_type = [OrganismType(4)]
strain.taxonomy.hybrids = ['Sac lac', 'Sac lcac3']
strain.growth.recommended_media = ['aa']
strain.growth.recommended_temp = {'min': 2, 'max':5}
strain.form_of_supply = ['lyo']
strain.collect.location.country = 'ESP'
errors = validate_strain(strain)
self.assertFalse(errors)
class TestIsolation(unittest.TestCase):
def test_initialize_isolation(self):
isolation = Isolation()
self.assertEqual(isolation.dict(), {})
isolation.who = "pepito"
self.assertTrue(ISOLATED_BY in isolation.dict())
isolation.date = DateRange().strpdate("2012----")
self.assertTrue(DATE_OF_ISOLATION in isolation.dict())
try:
isolation.location.site = "spain"
self.fail()
except (ValueError, AttributeError):
pass
class TestGenomicSequence(unittest.TestCase):
def test_empty_init(self):
gen_seq = GenomicSequence()
self.assertEqual(gen_seq.dict(), {})
gen_seq.marker_id = "pepe"
gen_seq.marker_type = "16S rRNA"
self.assertEqual(gen_seq.dict(), {
"marker_type": "16S rRNA", "INSDC": "pepe"})
if __name__ == "__main__":
# import sys;sys.argv = ['', 'TestStrain']
unittest.main()

View File

@ -1,51 +0,0 @@
from mirri.entities.strain import ValidationError
import unittest
from pathlib import Path
from pprint import pprint
from mirri.io.parsers.mirri_excel import parse_mirri_excel
TEST_DATA_DIR = Path(__file__).parent / "data"
class MirriExcelTests(unittest.TestCase):
def test_mirri_excel_parser(self):
in_path = TEST_DATA_DIR / "valid.mirri.xlsx"
with in_path.open("rb") as fhand:
parsed_data = parse_mirri_excel(fhand, version="20200601")
medium = parsed_data["growth_media"][0]
self.assertEqual("1", medium.acronym)
self.assertEqual(medium.description, "NUTRIENT BROTH/AGAR I")
strains = list(parsed_data["strains"])
strain = strains[0]
self.assertEqual(strain.publications[0].id, 1)
self.assertEqual(strain.publications[0].title, 'Cosa')
self.assertEqual(strain.id.number, "1")
pprint(strain.dict())
def xtest_mirri_excel_parser_invalid_fail(self):
in_path = TEST_DATA_DIR / "invalid.mirri.xlsx"
with in_path.open("rb") as fhand:
try:
parse_mirri_excel(fhand, version="20200601")
self.fail()
except ValidationError:
pass
def xtest_mirri_excel_parser_invalid(self):
in_path = TEST_DATA_DIR / "invalid.mirri.xlsx"
with in_path.open("rb") as fhand:
parsed_data = parse_mirri_excel(
fhand, version="20200601")
errors = parsed_data["errors"]
for _id, _errors in errors.items():
print(_id, _errors)
if __name__ == "__main__":
# import sys;sys.argv = ['',
# 'MirriExcelTests.test_mirri_excel_parser_invalid']
unittest.main()

View File

@ -1,589 +0,0 @@
from datetime import datetime
import unittest
from pathlib import Path
from itertools import chain
from mirri.validation.tags import (
CHOICES,
COORDINATES,
CROSSREF,
CROSSREF_NAME,
DATE,
MATCH,
MISSING,
MULTIPLE,
NUMBER,
REGEXP,
SEPARATOR,
TAXON,
TYPE,
UNIQUE,
VALUES
)
from mirri.validation.excel_validator import (
is_valid_choices,
is_valid_coords,
is_valid_crossrefs,
is_valid_date,
is_valid_missing,
is_valid_number,
is_valid_regex,
is_valid_taxon,
is_valid_unique,
is_valid_file,
validate_mirri_excel,
)
TEST_DATA_DIR = Path(__file__).parent / "data"
TS_VALUE = "value"
TS_CONF = "conf"
TS_ASSERT = "assert_func"
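# Keys of the table-driven test cases below: the raw value under test, the
# validator configuration, and the unittest assertion it is expected to satisfy.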
class MirriExcelValidationTests(unittest.TestCase):
def test_validation_structure(self):
in_path = TEST_DATA_DIR / "invalid_structure.mirri.xlsx"
with in_path.open("rb") as fhand:
error_log = validate_mirri_excel(fhand)
entities = []
err_codes = []
for ett, errors in error_log.get_errors().items():
entities.append(ett)
err_codes.extend([err.code for err in errors])
self.assertIn("EFS", entities)
self.assertIn("STD", entities)
self.assertIn("GOD", entities)
self.assertIn("GMD", entities)
self.assertIn("EFS03", err_codes)
self.assertIn("EFS06", err_codes)
self.assertIn("EFS08", err_codes)
self.assertIn("GOD06", err_codes)
self.assertIn("GMD01", err_codes)
self.assertIn("STD05", err_codes)
self.assertIn("STD08", err_codes)
self.assertIn("STD12", err_codes)
def test_validation_content(self):
in_path = TEST_DATA_DIR / "invalid_content.mirri.xlsx"
with in_path.open("rb") as fhand:
error_log = validate_mirri_excel(fhand)
entities = []
err_codes = []
for ett, errors in error_log.get_errors().items():
entities.append(ett)
err_codes.extend([err.code for err in errors])
self.assertTrue(len(err_codes) > 0)
self.assertNotIn("EFS", entities)
self.assertIn("STD", entities)
self.assertIn("GOD", entities)
self.assertIn("GID", entities)
self.assertIn("GOD04", err_codes)
self.assertIn("GOD07", err_codes)
self.assertIn("GID03", err_codes)
self.assertIn("STD11", err_codes)
self.assertIn("STD15", err_codes)
self.assertIn("STD22", err_codes)
self.assertIn("STD04", err_codes)
self.assertIn("STD10", err_codes)
self.assertIn("STD07", err_codes)
self.assertIn("STD14", err_codes)
self.assertIn("STD16", err_codes)
def test_validation_valid(self):
in_path = TEST_DATA_DIR / "valid.mirri.xlsx"
with in_path.open("rb") as fhand:
error_log = validate_mirri_excel(fhand)
self.assertTrue(len(error_log.get_errors()) == 0)
class ValidationFunctionsTest(unittest.TestCase):
def test_is_valid_regex(self):
tests = [
{
TS_VALUE: "abcDEF",
TS_CONF: {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "123456",
TS_CONF: {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: "123456",
TS_CONF: {TYPE: REGEXP, MATCH: r"\d+"},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "abcdef",
TS_CONF: {TYPE: REGEXP, MATCH: r"\d+"},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: "abc 123",
TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "123 abc",
TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "123 ",
TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_regex(value, conf))
def test_is_valid_choices(self):
tests = [
{
TS_VALUE: "1",
TS_CONF: {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "1, 3",
TS_CONF: {
TYPE: CHOICES,
VALUES: ["1", "2", "3", "4"],
MULTIPLE: True,
SEPARATOR: ","
},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "5",
TS_CONF: {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_choices(value, conf))
def test_is_valid_crossref(self):
tests = [
{
TS_VALUE: "abc",
TS_CONF: {
TYPE: CROSSREF,
CROSSREF_NAME: "values",
"crossrefs_pointer": {"values": ["abc", "def", "ghi"]},
},
TS_ASSERT: self.assertTrue,
},
{
TS_VALUE: "123",
TS_CONF: {
TYPE: CROSSREF,
CROSSREF_NAME: "values",
"crossrefs_pointer": {"values": ["abc", "def", "ghi"]},
},
TS_ASSERT: self.assertFalse,
},
{
TS_VALUE: "abc, def",
TS_CONF: {
TYPE: CROSSREF,
CROSSREF_NAME: "values",
"crossrefs_pointer": {"values": ["abc", "def", "ghi"]},
MULTIPLE: True,
SEPARATOR: ",",
},
TS_ASSERT: self.assertTrue,
},
{
TS_VALUE: "abc, 123",
TS_CONF: {
TYPE: CROSSREF,
CROSSREF_NAME: "values",
"crossrefs_pointer": {"values": ["abc", "def", "ghi"]},
MULTIPLE: True,
SEPARATOR: ",",
},
TS_ASSERT: self.assertFalse,
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_crossrefs(value, conf))
def test_is_valid_missing(self):
tests = [
{
TS_VALUE: 1,
TS_CONF: {TYPE: MISSING},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "abc",
TS_CONF: {TYPE: MISSING},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: None,
TS_CONF: {TYPE: MISSING},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_missing(value, conf))
def test_is_valid_date(self):
tests = [
{
TS_VALUE: '2020-04-07',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: '2020/04/07',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: datetime(2021, 5, 1),
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: '2020-05',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: '2020/05',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 2020,
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: '2021 05 01',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: '04-07-2020',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: '2021-02-31',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: '2021-15',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: '15-2021',
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: 3000,
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: -2020,
TS_CONF: {TYPE: DATE},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_date(value, conf))
def test_is_valid_coordinates(self):
tests = [
{
TS_VALUE: "23; 50",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "-90; -100",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "90; 100",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "0; 0",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "10; 20; 5",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "10; 20; -5",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "91; 50",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: "87; 182",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: "-200; 182",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: "20, 40",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: "abc def",
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: 123,
TS_CONF: {TYPE: COORDINATES},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_coords(value, conf))
def test_is_valid_number(self):
tests = [
{
TS_VALUE: 1,
TS_CONF: {TYPE: NUMBER},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 2.5,
TS_CONF: {TYPE: NUMBER},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "10",
TS_CONF: {TYPE: NUMBER},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "10.5",
TS_CONF: {TYPE: NUMBER},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 5,
TS_CONF: {TYPE: NUMBER, "min": 0},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 5,
TS_CONF: {TYPE: NUMBER, "max": 10},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 5,
TS_CONF: {TYPE: NUMBER, "min": 0, "max": 10},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "hello",
TS_CONF: {TYPE: NUMBER},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: 10,
TS_CONF: {TYPE: NUMBER, "max": 5},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: 0,
TS_CONF: {TYPE: NUMBER, "min": 5},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_number(value, conf))
def test_is_valid_taxon(self):
tests = [
{
TS_VALUE: 'sp. species',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 'spp species subsp. subspecies',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 'spp species subsp. subspecies var. variety',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 'spp taxon',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 'Candidaceae',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: 'sp sp species',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertFalse
},
{
TS_VALUE: 'spp species abc. def',
TS_CONF: {TYPE: TAXON},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_taxon(value, conf))
def test_is_valid_unique(self):
tests = [
{
TS_VALUE: "abc",
TS_CONF: {
TYPE: UNIQUE,
"label": "values",
"shown_values": {}
},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "jkl",
TS_CONF: {
TYPE: UNIQUE,
"label": "values",
"shown_values": {
"values": {"abc": '',
"def": '',
"ghi": ''},
}
},
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: "abc",
TS_CONF: {
TYPE: UNIQUE,
"label": "values",
"shown_values": {
"values": {"abc": '',
"def": '',
"ghi": ''},
}
},
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
conf = test[TS_CONF]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_unique(value, conf))
def test_is_valid_file(self):
tests = [
{
TS_VALUE: TEST_DATA_DIR / "invalid_structure.mirri.xlsx",
TS_ASSERT: self.assertTrue
},
{
TS_VALUE: TEST_DATA_DIR / "invalid_excel.mirri.json",
TS_ASSERT: self.assertFalse
},
]
for test in tests:
value = test[TS_VALUE]
assert_func = test[TS_ASSERT]
with self.subTest(value=value):
assert_func(is_valid_file(value,))
if __name__ == "__main__":
# import sys
# sys.argv = ['',
#             'ValidationFunctionsTest.test_is_valid_regex']
unittest.main()

View File

@ -1,24 +0,0 @@
import unittest
from pathlib import Path
from mirri.io.writers.mirri_excel import write_mirri_excel
from mirri.io.parsers.mirri_excel import parse_mirri_excel
TEST_DATA_DIR = Path(__file__).parent / "data"
class MirriExcelTests(unittest.TestCase):
def test_valid_excel(self):
in_path = TEST_DATA_DIR / "valid.mirri.full.xlsx"
parsed_data = parse_mirri_excel(in_path.open('rb'), version="20200601")
strains = parsed_data["strains"]
growth_media = parsed_data["growth_media"]
out_path = Path("/tmp/test.xlsx")
write_mirri_excel(out_path, strains, growth_media, version="20200601")
if __name__ == "__main__":
# import sys;sys.argv = ['',
# 'BiolomicsWriter.test_mirri_excel_parser_invalid']
unittest.main()

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -62,6 +62,10 @@ class Entity():
def GID(self) -> str:
return 'Genomic Information'
def VRS(self) -> str:
return 'Version'
def OTD(self) -> str:
return 'Ontobiotope'

View File

@ -0,0 +1,547 @@
from typing import Optional
class ErrorMessage():
"""Error message
Args:
code (str): Error code.
pk (str | optional): The instance's primary key that triggered the error. Defaults to None.
value (str | optional): The instance's value that triggered the error. Defaults to None.
"""
def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None):
self.code = code.upper()
self.pk = pk
self.value = value
@property
def _codes(self) -> list:
return [
func
for func in dir(self)
if func.isupper() and
callable(getattr(self, func)) and
not func.startswith("__")
]
@property
def _messages(self) -> dict:
return {code: getattr(self, code) for code in self._codes}
@property
def message(self) -> str:
if not self._validate_code():
raise ValueError(f"{self.code} not found")
return self._messages[self.code]()
@property
def code(self) -> str:
return self._code
@code.setter
def code(self, code: str) -> None:
self._code = code.upper()
def _validate_code(self) -> bool:
return self.code in self._codes
@property
def pk(self) -> str:
return self._pk
@pk.setter
def pk(self, pk: str) -> None:
self._pk = pk
@property
def value(self) -> str:
return self._value
@value.setter
def value(self, value: str) -> None:
self._value = value
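# Usage sketch (illustrative): every upper-case method below encodes one error
# message, so ErrorMessage('GMD04', pk=some_acronym).message dispatches to
# GMD04() and interpolates the offending record's key into the text.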
"""
Excel File Structure Error Codes
"""
def EXL00(self):
return f"The provided file '{self.pk}' is not an excel(xlsx) file"
def EFS01(self):
return "The 'Growth media' sheet is missing. Please check the provided excel template."
def EFS02(self):
return "The 'Geographic origin' sheet is missing. Please check the provided excel template."
def EFS03(self):
return "The 'Literature' sheet is missing. Please check the provided excel template."
def EFS04(self):
return "The 'Sexual state' sheet is missing. Please check the provided excel template."
def EFS05(self):
return "The 'Strains' sheet is missing. Please check the provided excel template."
def EFS06(self):
return "The 'Ontobiotope' sheet is missing. Please check the provided excel template."
def EFS07(self):
return "The 'Markers' sheet is missing. Please check the provided excel template."
def EFS08(self):
return "The 'Genomic information' sheet is missing. Please check the provided excel template."
def EFS09(self):
return "The 'Version' sheet is missing. Please check the provided excel template."
"""
Growth Media Error Codes
"""
def GMD01(self):
return "The 'Acronym' column is a mandatory field in the Growth Media sheet."
def GMD02(self):
return "The 'Acronym' column is empty or has missing values."
def GMD03(self):
return "The 'Description' column is a mandatory field in the Growth Media sheet. The column can not be empty."
def GMD04(self):
return f"The 'Description' for growth media with Acronym {self.pk} is missing."
"""
Geographic Origin Error Codes
"""
def GOD01(self):
return "The 'ID' column is a mandatory field in the Geographic Origin sheet."
def GOD02(self):
return "The 'ID' column is empty or has missing values."
def GOD03(self):
return "The 'Country' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
def GOD04(self):
return f"The 'Country' for geographic origin with ID {self.pk} is missing."
def GOD05(self):
return f"The 'Country' for geographic origin with ID {self.pk} is incorrect."
def GOD06(self):
return f"The 'Locality' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
def GOD07(self):
return f"The 'Locality' for geographic origin with ID {self.pk} is missing."
"""
Literature Error Codes
"""
def LID01(self):
return "The 'ID' column is a mandatory field in the Literature sheet."
def LID02(self):
return "The 'ID' column empty or missing values."
def LID03(self):
return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID04(self):
#return f"The 'Full reference' for literature with ID {self.pk} is missing."
def LID05(self):
return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID06(self):
#return f"The 'Authors' for literature with ID {self.pk} is missing."
def LID07(self):
return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID08(self):
#return f"The 'Title' for literature with ID {self.pk} is missing."
def LID09(self):
return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty."
#def LID10(self):
#return f"The 'Journal' for literature with ID {self.pk} is missing."
def LID11(self):
return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID12(self):
return f"The 'Year' for literature with ID {self.pk} is missing."
def LID13(self):
return "The 'Volume' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID14(self):
return f"The 'Volume' for literature with ID {self.pk} is missing."
def LID15(self):
return "The 'First page' column is a mandatory field. The column can not be empty."
def LID16(self):
return f"The 'First page' for literature with ID {self.pk} is missing."
def LID17(self):
return( f"There are four types of ways to fill in the 'Literature' sheet.",
"1st- Columns 'ID' and 'DOI' must be obrigatory.",
"2nd-Columns 'ID' and 'PMID' are obrigatory.",
"3rd-Columns 'ID' and 'Full reference' are obrigatory.",
"In the alternative of these three types of forms not being filled in, we have:",
"4th-Columns 'ID', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'.")
def LID18(self):
return "The 'PMID' column is a mandatory field. The column can not be empty."
#def LID19(self):
#return f"PMID for literature with ID {self.pk} is missing."
def LID20(self):
return "The 'DOI' column is a mandatory field. The column can not be empty."
#def LID21(self):
#return f"DOI for literature with ID {self.pk} is missing."
"""
Strains Error Codes
"""
def STD01(self):
return "The 'accessionNumber' column is a mandatory field in the Strains sheet."
def STD02(self):
return "The 'accessionNumber' column is empty or has missing values."
def STD03(self):
return f"The 'accessionNumber' must be unique. The '{self.value}' is repeated."
def STD04(self):
return (f"The 'accessionNumber' {self.pk} is not according to the specification."
" The value must be of the format '<Sequence of characters> <sequence of characters>'.")
def STD05(self):
return f"The 'useRestrictions' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD06(self):
return f"The 'useRestrictions' for strain with accessionNumber {self.pk} is missing."
def STD07(self):
return (f"The 'useRestrictions' for strain with accessionNumber {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD08(self):
return f"The 'nagoyaConditions' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD09(self):
return f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is missing."
def STD10(self):
return (f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD11(self):
return (f"The 'registeredCollection' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD12(self):
return "The 'riskGroup' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD13(self):
return f"The 'riskGroup' for strain with accessionNumber {self.pk} is missing."
def STD14(self):
return (f"The 'riskGroup' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.")
def STD15(self):
return (f"The 'dualUse' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD16(self):
return (f"The “euQuarantine” for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD17(self):
return f"The 'organismType' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD18(self):
return f"The 'organismType' for strain with accessionNumber {self.pk} is missing."
def STD19(self):
return (f"The 'organismType' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', "
"'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.")
def STD20(self):
return f"The 'speciesName' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD21(self):
return f"The 'speciesName' for strain with accessionNumber {self.pk} is missing."
def STD22(self):
return f"The 'speciesName' for strain with accessionNumber {self.pk} is incorrect."
def STD23(self):
return (f"The 'hybrid' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD24(self):
return (f"The 'depositHistory' for strain with accessionNumber {self.pk} is incorrect."
"The field includes entries separated by '<' meaning 'received from'."
"Entries may include persons or CCs. The name of the CC should be followed by"
"the month, when available, and year of the acquisition. Between parentheses,"
"the strain designation or CC numbers and/or a name can also be entered when "
"a name change has occurred.")
def STD25(self):
return (f"The 'depositDate' for strain with accessionNumber {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD26(self):
return (f"The 'accessionDate' for strain with accessionNumber {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD27(self):
return (f"The 'collectionDate' for strain with accessionNumber {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD28(self):
return (f"The 'isolationDate' for strain with accessionNumber {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD29(self):
return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect."
" It must have two decimal numbers separated by ','")
def STD30(self):
return f"The 'temperatureGrowthRange' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD31(self):
return f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is missing."
def STD32(self):
return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect."
" It must have two decimal numbers separated by ','.")
def STD33(self):
return ("The 'recommendedTemperature' column is a mandatory field in the Strains Sheet. The column can not be empty.")
def STD34(self):
return f"The 'recommendedTemperature' for strain with accessionNumber {self.pk} is missing."
def STD35(self):
return f"The value of 'recommendedTemperature' for strain with accessionNumber {self.pk} is not in the Growth Media Sheet."
def STD36(self):
return f"The 'supplyForms' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD37(self):
return f"The 'supplyForms' for strain with accessionNumber {self.pk} is missing."
def STD38(self):
return f"The value of 'supplyForms' for strain with accessionNumber {self.pk} is not in the Forms of Supply Sheet."
def STD39(self):
return (f"The 'geographicCoordinates' column for strain with accessionNumber {self.pk} is incorrect."
"The allowed formats are two, three or four decimal numbers separated by ','. Moreover, the first number must be."
"between [-90, 90], the second between [-180, 180], and the third and fourth refers to the precision and altitude, defined by decimal numbers."
"Put a question mark for lack of precision or altitude when one of them is missing. Leave the values blank when both are missing. ")
def STD40(self):
return (f"The 'country' column for strain with accessionNumber {self.pk} is incorrect."
"The allowed formats are one decimal number between [-200, 8000].")
def STD54(self):
return (f"The 'country'column is a mandatory field in the Strains Sheet. The column can not be empty.")
def STD55(self):
return (f"The 'country' for strain with accessionNumber {self.pk} is missing.")
def STD41(self):
return f"The value of 'ontobiotopeTerms' for strain with accessionNumber {self.pk} is not in the Ontobiotope Sheet."
def STD42(self):
return (f"The 'gmo' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2")
def STD43(self):
return (f"The 'sexualState' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', "
"'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'")
def STD44(self):
return (f"The 'ploidy' for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9")
def STD45(self):
msg = f"At least one of the values '{self.value}' of the literature field for strain {self.pk} are not in the literature sheet. "
msg += "If the those values are Pubmed ids or DOIs, please ignore this messsage"
return msg
def STD46(self):
return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is not according to specification."
f"The 'geographicOrigin' column must consist of the ID's associated with the Geographic origin sheet.")
def STD47(self):
return "The 'country' column is a mandatory field in the Strains sheet."
def STD48(self):
return "The 'country' column is empty or has missing values."
def STD49(self):
return (f"The “qps” for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD50(self):
return (f"The “axenicCulture” for strain with accessionNumber {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Axenic', 'Not axenic'.")
def STD51(self):
return f"The 'mirriAccessionNumber' must be unique. The '{self.pk}' is repeated."
def STD52(self):
return (f"The 'mirriAccessionNumber' for strain with accessionNumber {self.pk} is incorrect."
" It must have the expression MIRRI followed by 7 digits")
def STD53(self):
return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect."
" The displayed expression it should be composed of: site name ';' website url." )
def STD56(self):
return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect."
" The url must be valid. " )
def STD57(self):
return (f"The 'country' for strain with accessionNumber {self.pk} is incorrect."
"This information must be expressed by using the ISO-3166 standard for country"
"codes. The preferred set is ISO 3166-1 alpha-2 (two letters code), but ISO 3166-"
"1 alpha-3 (three letters code) is also accepted. Former country codes must"
"follow standards part three ISO 3166-3 (four letters code). Only one code can"
"be included." )
def STD58(self):
return (f"The 'mtaFile' for strain with accessionNumber {self.pk} is incorrect."
" The url must be valid. " )
def STD59(self):
return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect."
"The displayed expression it should be composed of: name ';' website url."
"When only one URL is provided, the title may be omitted. In this case, the URL"
"will be shown in clear to users." )
def STD60(self):
return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect."
" The url must be valid. ")
def STD61(self):
return (f"The 'sequenceLiterature' for strain with accessionNumber {self.pk} is incorrect."
"Numeric identifiers separated by a semicolon ';'.")
def STD62(self):
return (f"The 'plasmidCollections' for strain with accessionNumber {self.pk} is incorrect."
"It should include the name of the plasmid followed by the CC number in"
"parentheses. More than one plasmid can be reported, separated by ';'. "
"Plasmid names should be provided as free text."
"CC numbers should be composed by the CC acronym followed by a number"
"separated by a space'. Numeric identifiers separated by a semicolon ';'.")
def STD63(self):
return (f"The 'otherCollectionNumbers' for strain with accessionNumber {self.pk} is incorrect."
" The value must be of the format '<Sequence of characters> <sequence of characters>'.")
def STD64(self):
return (f"The 'type' for strain with accessionNumber {self.pk} is incorrect."
f"Your value is {self.value} and the accepted values are 1, 2.")
def STD65(self):
return (f"The 'status' for strain with accessionNumber {self.pk} is incorrect."
"The structure should be 'type of <character string>.")
def STD68(self):
return (f"The 'geographicOrigin'column is a mandatory field in the Strains Sheet. The column can not be empty.")
def STD69(self):
return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is missing.")
"""
Genomic Information Error Codes
"""
def GID01(self):
return f"The 'Strain accessionNumber' (Strain AN) column is a mandatory field in the Genomic Information Sheet."
def GID02(self):
return f"The 'Strain accessionNumber' (Strain AN) column is empty or has missing values."
def GID03(self):
return f"The value of 'Strain accessionNumber' (Strain AN) {self.value} is not in the Strains sheet."
def GID04(self):
return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
def GID05(self):
return f"The 'Marker' for genomic information with Strain AN {self.pk} is missing."
def GID06(self):
return f"The value of 'Marker' {self.value} is not in the Markers sheet."
def GID07(self):
return f"The 'INSDC AN' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
def GID08(self):
return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is missing."
def GID09(self):
return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is incorrect."
def GID10(self):
return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect."
" It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces.")
def GID11(self):
return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect."
"An INSDC accession number is an alphanumeric"
"code made by a fixed number of letters followed by a fixed number of digits,"
"without any separation. For sequences, the code is currently made of two"
"letters followed by six numbers.")
"""
Version Error Codes
"""
def VRS01(self):
return "The 'Version' columns is a mandatory field in the Version Sheet."
def VRS02(self):
return "The 'Version' columns is empty or has missing values."
def VRS03(self):
return "The 'Date' columns is a mandatory field in the Control Sheet."
def VRS04(self):
return "The 'Date' columns is empty or has missing values."
def VRS05(self):
return f"The version {self.value} is the only one to be used."
"""
Ontobiotope Error Codes
"""
def OTD01(self):
return "The 'ID' column is a mandatory field in the Ontobiotope Sheet."
def OTD02(self):
return "The 'ID' column is empty or has missing values."
#def OTD03(self):
#return "The 'Name' column is a mandatory field in the Ontobiotope Sheet. The column can not be empty."
#def OTD04(self):
#return f"The 'Name' for ontobiotope with ID {self.pk} is missing."

View File

@ -4,27 +4,51 @@ from io import BytesIO
from zipfile import BadZipfile
from datetime import datetime
from calendar import monthrange
import requests
from openpyxl import load_workbook
import pycountry
from mirri.io.parsers.excel import workbook_sheet_reader, get_all_cell_data_from_sheet
from mirri.validation.error_logging import ErrorLog, Error
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON,
TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO)
MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON,
TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE,
HISTORY,NAGOYA1, VERSION)
from mirri.settings import LOCATIONS, SUBTAXAS
from mirri.validation.validation_conf_20200601 import MIRRI_20200601_VALLIDATION_CONF
from mirri.validation.validation_conf_12052023 import version_config
from mirri.validation.validation_conf_12052023 import MIRRI_12052023_VALLIDATION_CONF
def validate_mirri_excel(fhand, version="20200601"):
if version == "20200601":
configuration = MIRRI_20200601_VALLIDATION_CONF
def validate_mirri_excel(fhand, version= "5.1.2" ):
if version == "5.1.2":
configuration = MIRRI_12052023_VALLIDATION_CONF
else:
raise NotImplementedError("Only version20200601 is implemented")
raise NotImplementedError("Only version 5.1.2 is implemented")
return validate_excel(fhand, configuration)
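# Usage sketch (the file name is illustrative): validating a spreadsheet against the 5.1.2 schema.
#   with open('strains_template.xlsx', 'rb') as fhand:
#       error_log = validate_mirri_excel(fhand, version='5.1.2')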
def version(value, validation_conf=None):
if value is None:
return True
try:
for accepted_version in version_config:
if value == accepted_version:
return True
return False
except:
return False
def validate_country_code(value, validation_conf=None):
if value is None:
return True
try:
if pycountry.countries.get(alpha_2=value) or pycountry.countries.get(alpha_3=value) or pycountry.historic_countries.get(alpha_4=value):
return True
return False
except:
return False
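# Examples (assuming a standard pycountry data set):
#   validate_country_code('PT')   -> True   (ISO 3166-1 alpha-2)
#   validate_country_code('PRT')  -> True   (ISO 3166-1 alpha-3)
#   validate_country_code('CSHH') -> True   (ISO 3166-3 former country code)
#   validate_country_code('XX')   -> False  (unknown code)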
def validate_excel(fhand, configuration):
validation_conf = configuration['sheet_schema']
@ -185,11 +209,14 @@ def validate_row(row, validation_steps, in_memory_sheets):
kind = validation_step[TYPE]
error_code = validation_step[ERROR_CODE]
if kind == NAGOYA:
if not is_valid_nagoya(row, in_memory_sheets):
if not is_valid_nagoya_v12052023(row, in_memory_sheets):
return error_code
elif kind == BIBLIO:
if not is_valid_pub(row):
return error_code
elif kind == NAGOYA1:
if not is_valid_nago(row):
return error_code
else:
msg = f'{kind} is not a recognized row validation type method'
raise NotImplementedError(msg)
@ -208,48 +235,69 @@ def validate_cell(value, validation_steps, crossrefs, shown_values, label):
if error_code is not None:
return error_code
def is_valid_pub(row):
pub_id = row.get('ID', None)
pub_pmid = row.get('PMID', None)
pub_doi = row.get('DOI', None)
title = row.get('Title', None)
full_reference = row.get('Full reference', None)
authors = row.get('Authors', None)
journal = row.get('Journal', None)
year = row.get('Year', None)
volumen = row.get('Volumen', None)
volumen = row.get('Volume', None)
first_page = row.get('First page', None)
book_title = row.get('Book title', None)
editors = row.get('Editors', None)
publishers = row.get('Publishers', None)
if full_reference:
if (pub_id != None and pub_doi != None) or (pub_id != None and pub_pmid != None) or (pub_id != None and full_reference != None) or (pub_id != None and authors != None and title != None and journal != None and year != None and volumen != None and first_page != None) :
return True
is_journal = bool(title)
if (is_journal and (not authors or not journal or not not year or
not volumen or not first_page)):
return False
if (not is_journal and (not authors or not year or
not editors or not publishers or not book_title)):
return False
# if (is_journal and (not authors or not journal or not not year or
# not volumen or not first_page)):
# return False
#if (not is_journal and (not authors or not year or
# not editors or not publishers or not book_title)):
# return False
return False
def is_valid_nago(row):
if not row:
return True
status = row.get("status", None)
type = row.get("type", None)
regex = r'^[a-zA-Z\s.\'-]+$'
if status != None and type != None:
if (re.match(regex, status) and type==1):
return False
if (type == 2 and status is None):
return False
return True
def parsee_mirri_excel(row, in_memory_sheets, version=""):
if version == "12052023":
return is_valid_nagoya_v12052023(row, in_memory_sheets)
else:
raise NotImplementedError("Only version 12052023 is implemented")
def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity
location_index = row.get('Geographic origin', None)
def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity
location_index = row.get('geographicOrigin', None)
if location_index is None:
country = None
else:
geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {})
country = geo_origin.get('Country', None)
_date = row.get("Date of collection", None)
_date = row.get("collectionDate", None)
if _date is None:
_date = row.get("Date of isolation", None)
_date = row.get("isolationDate", None)
if _date is None:
_date = row.get("Date of deposit", None)
_date = row.get("depositDate", None)
if _date is None:
_date = row.get("Date of inclusion in the catalogue", None)
_date = row.get("accessionDate", None)
if _date is not None:
year = _date.year if isinstance(_date, datetime) else int(str(_date)[:4])
else:
@ -258,8 +306,8 @@ def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity
if year is not None and year >= 2014 and country is None:
return False
return True
return True
def is_valid_regex(value, validation_conf):
if value is None:
@ -310,7 +358,9 @@ def is_valid_choices(value, validation_conf):
values = [v.strip() for v in str(value).split(separator)]
else:
values = [str(value).strip()]
sorted_values = sorted(values)
if sorted_values != values:
return False
return all(value in choices for value in values)
@ -352,20 +402,136 @@ def is_valid_date(value, validation_conf):
return True
def is_valid_coords(value, validation_conf=None):
# sourcery skip: return-identity
def is_valid_dominio(value, validation_conf=None):
if value is None:
return True
try:
items = [i.strip() for i in value.split(";")]
latitude = float(items[0])
longitude = float(items[1])
if len(items) > 2:
precision = float(items[2])
if latitude < -90 or latitude > 90:
return False
if longitude < -180 or longitude > 180:
if len(items) >1:
for i in range(0, len(items),2):
nameSite = str(items[i])
urlSite = str(items[i+1])
dominio = urlSite.split(".")[-2]
if nameSite.lower() != dominio:
return False
return True
except:
return False
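# Example with illustrative values: "mirri; https://www.mirri.org" is accepted because the
# token before the last dot of the URL ('mirri') equals the site name, case-insensitively;
# "example; https://www.mirri.org" would be rejected.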
def is_valid_title(value, validation_conf=None):
if value is None:
return True
try:
items = [i.strip() for i in value.split(";")]
if len(items) >1:
for i in range(0, len(items),2):
nameSite = (items[i])
urlSite = str(items[i+1])
regex = r'^(http|https):\/\/[a-z0-9\-\.]+\.[a-z]{2,}([/a-z0-9\-\.]*)*$'
if re.match(regex, nameSite) or isinstance(nameSite, int) or nameSite == '':
return False
return True
except:
return False
def is_valid_url_title(value, validation_conf=None):
if value is None:
return True
try:
items = [i.strip() for i in value.split(";")]
if len(items) ==1:
urlSite = str(items[0])
response = requests.head(urlSite)
if response.status_code != 200:
return False
else:
items = [i.strip() for i in value.split(";")]
for i in range(0, len(items),2):
nameSite = (items[i])
urlSite = str(items[i+1])
response = requests.head(urlSite)
if response.status_code != 200:
return False
return True
except:
return False
def is_valid_url_dominio(value, validation_conf=None):
if value is None:
return True
try:
items = [i.strip() for i in value.split(";")]
for i in range(0, len(items),2):
nameSite = str(items[i])
urlSite = str(items[i+1])
response = requests.head(urlSite)
if response.status_code != 200:
return False
return True
except:
return False
def is_valid_just_url(value, validation_conf=None):
if value is None:
return True
try:
items = [i.strip() for i in value.split(";")]
for i in items:
nameSite = str(items[0])
response = requests.head(i)
if response.status_code != 200:
return False
return True
except:
return False
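# Note on the three URL checks above (is_valid_url_title, is_valid_url_dominio,
# is_valid_just_url): they issue live HTTP HEAD requests via requests.head() and treat any
# response other than 200 (including redirects and network errors) as invalid, so validation
# results depend on network availability at run time.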
def is_valid_history(value, validation_conf=None):
if value is None:
return True
try:
items = [i.strip() for i in value.split("<")]
regex1 = r'^[a-zA-Z0-9 &,;.:''-]+,?\s*((19|20)\d{2})'
regex2 = r'^[a-zA-Z0-9 &,;.:''-]+,?\s*[a-zA-Z0-9 &,;.''-] (19|20)\d{2}\s\([a-zA-Z0-9 &,;.''-:]+\)'
regex3 = r'^[a-zA-Z0-9 &,;.:''-]+\,?\s*[a-zA-Z0-9 &,;.''-]'
regex4 = r'^[a-zA-Z0-9 &,;.''-]+,?\s*(19|20)\d{2}\s\([a-zA-Z0-9 .''-,;&:]+\)'
regex5 = r'^[a-zA-Z0-9 &,;.:''-]+,?\s*\([a-zA-Z0-9 &,;.''-:]+\) (19|20)\d{2}'
for i in items:
if not (re.match(regex1, i) or re.match(regex2, i) or re.match(regex3, i)
or re.match(regex4, i) or re.match(regex5, i)):
return False
return True
except:
return False
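# Examples (illustrative; acceptance follows from the patterns above): a value such as
# "CBS 123, 1987 < IMI, 1990" is accepted because each '<'-separated entry matches the
# name-plus-year pattern, while an entry left empty after a trailing '<' matches none of
# the patterns and is rejected.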
def is_valid_coords(value, validation_conf=None):
if value is None:
return True
try:
regex1 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?)(\s*;\s*-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?))*$'
regex2 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?)\s*;\s*-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?)\s*;\s*(\d+\.\d+|\?)\s*;\s*(\d+\.\d+|\?)$|^(\d+\.\d+|\?)$|^\s*;\s*$'
if not re.match(regex1, value) and not re.match(regex2, value):
return False
return True
except:
return False
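# Examples (illustrative; acceptance follows from the two patterns above):
#   "40.64; -8.65"            -> accepted (latitude; longitude)
#   "40.64; -8.65; ?; 120.5"  -> accepted (precision unknown, altitude 120.5)
#   "95.0; 10.0"              -> rejected (latitude outside [-90, 90])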
@ -375,24 +541,6 @@ def is_valid_missing(value, validation_conf=None):
return value is not None
def is_valid_number(value, validation_conf):
if value is None:
return True
try:
value = float(value)
except TypeError:
return False
except ValueError:
return False
_max = validation_conf.get('max', None)
_min = validation_conf.get('min', None)
if (_max is not None and value > _max) or (_min is not None and value < _min):
return False
return True
def is_valid_taxon(value, validation_conf=None):
multiple = validation_conf.get(MULTIPLE, False)
separator = validation_conf.get(SEPARATOR, ';')
@ -429,6 +577,8 @@ def _is_valid_taxon(value):
def is_valid_unique(value, validation_conf):
if not value:
return True
label = validation_conf['label']
shown_values = validation_conf['shown_values']
if label not in shown_values:
@ -444,7 +594,6 @@ def is_valid_unique(value, validation_conf):
return True
def is_valid_file(path):
try:
with path.open("rb") as fhand:
@ -464,8 +613,15 @@ VALIDATION_FUNCTIONS = {
CROSSREF: is_valid_crossrefs,
DATE: is_valid_date,
COORDINATES: is_valid_coords,
NUMBER: is_valid_number,
TAXON: is_valid_taxon,
TITLE: is_valid_title,
DOMINIO: is_valid_dominio,
URL_TITLE: is_valid_url_title,
URL_DOMINIO: is_valid_url_dominio,
JUST_URL: is_valid_just_url,
ISO: validate_country_code,
HISTORY: is_valid_history,
VERSION: version,
UNIQUE: is_valid_unique}

View File

@ -16,9 +16,20 @@ MATCH = 'match'
VALUES = 'values'
DATE = 'date'
COORDINATES = 'coord'
COORDINATES1 = 'coord1'
NUMBER = 'number'
TAXON = 'taxon'
UNIQUE = 'unique'
ROW_VALIDATION = 'row_validation'
NAGOYA = 'nagoya'
BIBLIO = 'bibliography'
DOMINIO = 'is_valid_dominio'
TITLE = 'is_valid_title'
URL_DOMINIO = 'is_valid_url_dominio'
URL_TITLE = 'is_valid_url_title'
ISO = 'validate_country_code'
JUST_URL = 'is_valid_just_url'
HISTORY = 'is_valid_history'
MEU = 'is_valid_crossrefs_meu'
NAGOYA1 = 'nagoya1'
VERSION = 'version'

View File

@ -1,14 +1,20 @@
#!/usr/bin/env python
import pandas as pd
import sys
from pathlib import Path
from mirri.validation.excel_validator import validate_mirri_excel
import warnings
warnings.simplefilter("ignore")
from mirri.validation.excel_validator import validate_mirri_excel
def main():
path = Path(sys.argv[1])
error_log = validate_mirri_excel(path.open("rb"))
version = str(sys.argv[2])
try:
error_log = validate_mirri_excel(path.open("rb"), version=version)
except NotImplementedError as e:
print(e)
sys.exit(1)
for errors in error_log.get_errors().values():
for error in errors:

View File

@ -1,10 +1,13 @@
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,
VALIDATION, VALUES, BIBLIO)
MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,VERSION,
VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1)
from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS)
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET)
# GEOGRAPHIC_ORIGIN
# SEXUAL_STATE_SHEET,
# RESOURCE_TYPES_VALUES,
@ -12,9 +15,12 @@ from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
# PLOIDY_SHEET)
STRAIN_FIELDS = [
{
FIELD: "Accession number",
FIELD: "accessionNumber",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: 'STD01'},
{TYPE: UNIQUE, ERROR_CODE: 'STD03'},
@ -23,16 +29,24 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "Restrictions on use",
FIELD: "useRestrictions",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD05"},
{TYPE: MISSING, ERROR_CODE: "STD06"},
{TYPE: MISSING, ERROR_CODE: "STD06"},
{TYPE: CHOICES, VALUES: ["1", "2", "3"],
MULTIPLE: False, ERROR_CODE: "STD07"}
]
},
{
FIELD: "mirriAccessionNumber",
VALIDATION: [
{TYPE: UNIQUE, ERROR_CODE: 'STD51'},
{TYPE: REGEXP, MATCH: "^MIRRI[0-9]{7}$", ERROR_CODE: "STD52"},
],
},
{
FIELD: "Nagoya protocol restrictions and compliance conditions",
FIELD: "nagoyaConditions",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD08"},
{TYPE: MISSING, ERROR_CODE: "STD09"},
@ -41,29 +55,53 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "ABS related files",
VALIDATION: [],
FIELD: "absFile",
VALIDATION: [
{TYPE: TITLE, ERROR_CODE: "STD59"},
{TYPE: URL_TITLE, ERROR_CODE: "STD60",
MULTIPLE: True, SEPARATOR: ";"},
],
},
{
FIELD: "siteLinks",
VALIDATION: [
{TYPE: DOMINIO, ERROR_CODE: "STD53",
MULTIPLE: False, SEPARATOR: ";"},
{TYPE: URL_DOMINIO, ERROR_CODE: "STD56",
MULTIPLE: False, SEPARATOR: ";"},
],
},
{
FIELD: "MTA file",
VALIDATION: [],
FIELD: "mtaFile",
VALIDATION: [
{TYPE: JUST_URL, ERROR_CODE: "STD58",
MULTIPLE: True, SEPARATOR: ";"},
],
},
{
FIELD: "Other culture collection numbers",
# VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07",
# MULTIPLE: True, SEPARATOR: ";"}
# ]
FIELD: "otherCollectionNumbers",
VALIDATION: [
{TYPE: REGEXP, MATCH: "([^ ]* [^ ]*)(; [^ ]* [^ ]*)*$", ERROR_CODE: "STD63",
MULTIPLE: True, SEPARATOR: ';'},
#{TYPE: CROSSREF, CROSSREF_NAME: "Strains", ERROR_CODE: "STD64"},
]
},
{
FIELD: "Strain from a registered collection",
FIELD: "registeredCollection",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD11"}
]
},
{
FIELD: "type",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD64"},
]
},
{
FIELD: "Risk Group",
FIELD: "riskGroup",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD12"},
{TYPE: MISSING, ERROR_CODE: "STD13"},
@ -72,33 +110,41 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "Dual use",
FIELD: "dualUse",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD15"}
]
},
{
FIELD: "Quarantine in Europe",
FIELD: "euQuarantine",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD16"}
]
},
{
FIELD: "axenicCulture",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["Axenic", "Not axenic"],
ERROR_CODE: "STD50"}
]
},
{
FIELD: "Organism type",
FIELD: "organismType",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD17"},
{TYPE: MISSING, ERROR_CODE: "STD18"},
{TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria",
"Cyanobacteria", "Filamentous Fungi",
"Phage", "Plasmid", "Virus", "Yeast",
"1", "2", "3", "4", "5", "6", "7", "8", "9"],
"Cyanobacteria", "Filamentous Fungi", "Filamentous fungi",
"Yeast", "Microalgae",
"1", "2", "3", "4", "5", "6", "7"],
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"}
]
},
{
FIELD: "Taxon name",
FIELD: "speciesName",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD20"},
{TYPE: MISSING, ERROR_CODE: "STD21"},
@ -107,73 +153,69 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "Infrasubspecific names",
FIELD: "infrasubspecificNames",
VALIDATION: []
},
{
FIELD: "Comment on taxonomy",
FIELD: "taxonomyComments",
VALIDATION: []
},
{
FIELD: "Interspecific hybrid",
FIELD: "hybrid",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD23"}
]
},
{
FIELD: "Status",
FIELD: "status",
VALIDATION: [
{TYPE: REGEXP, MATCH: "^(type of|neotype of|holotype of |epitype of) ([a-zA-Z .'-]+)$", ERROR_CODE: "STD65"},
]
},
{
FIELD: "History of deposit",
FIELD: "depositHistory",
VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex
# MULTIPLE: True, SEPARATOR: ";"}
{TYPE: HISTORY, ERROR_CODE: 'STD24'},
]
},
{
FIELD: "Depositor"
FIELD: "depositor",
VALIDATION: []
},
{
FIELD: "Date of deposit",
FIELD: "depositDate",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD25"},
]
},
{
FIELD: "Date of inclusion in the catalogue",
FIELD: "accessionDate",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD26"},
]
},
{
FIELD: "Collected by",
FIELD: "collector",
VALIDATION: []
},
{
FIELD: "substrate",
VALIDATION: []
},
{
FIELD: "Date of collection",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD27"},
]
},
{
FIELD: "Isolated by",
},
{
FIELD: "Date of isolation",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD28"},
]
},
{
FIELD: "Substrate/host of isolation",
},
{
FIELD: "Tested temperature growth range",
FIELD: "temperatureGrowthRange",
VALIDATION: [
{TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Recommended growth temperature",
FIELD: "recommendedTemperature",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD30"},
{TYPE: MISSING, ERROR_CODE: "STD31"},
@ -182,17 +224,9 @@ STRAIN_FIELDS = [
MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Recommended medium for growth",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD33"},
{TYPE: MISSING, ERROR_CODE: "STD34"},
{TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
]
},
{
FIELD: "Form of supply",
FIELD: "supplyForms",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD36"},
{TYPE: MISSING, ERROR_CODE: "STD37"},
@ -202,52 +236,70 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "Other denomination",
FIELD: "otherDenomination",
VALIDATION: []
},
{
FIELD: "Coordinates of geographic origin",
FIELD: "geographicCoordinates",
VALIDATION: [
{TYPE: COORDINATES, ERROR_CODE: "STD39"},
]
},
{
FIELD: "Altitude of geographic origin",
VALIDATION: [
{TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"},
]
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "Geographic origin",
FIELD: "geographicOrigin",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD68"},
{TYPE: MISSING, ERROR_CODE: "STD69"},
{TYPE: CROSSREF, CROSSREF_NAME: "Geographic origin", ERROR_CODE: "STD46"},
]
},
{
FIELD: "isolationHabitat",
VALIDATION: []
},
{
FIELD: "Isolation habitat",
},
{
FIELD: "Ontobiotope term for the isolation habitat",
FIELD: "ontobiotopeTerms",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope",
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"}
]
},
{
FIELD: "qps",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD49"}
]
},
{
FIELD: "GMO",
FIELD: "gmo",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD42"}
]
},
{
FIELD: "GMO construction information",
FIELD: "gmoConstruction",
VALIDATION: []
},
{
FIELD: "Mutant information",
FIELD: "mutant",
VALIDATION: []
},
{
FIELD: "Genotype",
FIELD: "genotype",
VALIDATION: []
},
{
FIELD: "Sexual state",
FIELD: "Plant pathogenicity code",
VALIDATION: []
},
{
FIELD: "sexualState",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET,
ERROR_CODE: "STD43"}
@ -258,46 +310,78 @@ STRAIN_FIELDS = [
]
},
{
FIELD: "Ploidy",
FIELD: "ploidy",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"],
{TYPE: CHOICES, VALUES: ["1", "2", "3", "4", "5", "9"],
ERROR_CODE: "STD44"}
]
},
{
FIELD: "Plasmids",
FIELD: "plasmids",
VALIDATION: []
},
{
FIELD: "Plasmids collections fields",
FIELD: "plasmidCollections",
VALIDATION: [
{TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(\s*;([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$",
ERROR_CODE: "STD62"}
]
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "Literature",
FIELD: "identificationLiterature",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET,
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"}
]
},
{
FIELD: "Plant pathogenicity code",
FIELD: "pathogenicity",
VALIDATION: []
},
{
FIELD: "Pathogenicity",
FIELD: "enzymes",
VALIDATION: []
},
{
FIELD: "Enzyme production",
FIELD: "metabolites",
VALIDATION: []
},
{
FIELD: "Production of metabolites",
FIELD: "applications",
VALIDATION: []
},
{
FIELD: "Applications",
FIELD: "remarks",
VALIDATION: []
},
{
FIELD: "Remarks"
FIELD: "sequenceLiterature",
VALIDATION: [
{TYPE: REGEXP, MATCH: "^\d+(\s*;?\s*\d+)*$", ERROR_CODE: "STD61"},
]
},
{
FIELD: "Literature linked to the sequence/genome",
{
FIELD: "recommendedMedium",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD33"},
{TYPE: MISSING, ERROR_CODE: "STD34"},
{TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
]
},
{
FIELD: "country",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD54"},
{TYPE: MISSING, ERROR_CODE: "STD55"},
{TYPE: ISO, ERROR_CODE: "STD57"},
#{TYPE: CROSSREF, CROSSREF_NAME: COUNTRY_CODES_SHEET, ERROR_CODE: "STD57"}
]
},
]
SHEETS_SCHEMA = {
@ -317,7 +401,7 @@ SHEETS_SCHEMA = {
FIELD: "Country",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD03"},
{TYPE: MISSING, ERROR_CODE: "GOD04"}
{TYPE: MISSING, ERROR_CODE: "GOD04"},
]
},
{
@ -389,6 +473,7 @@ SHEETS_SCHEMA = {
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID07"},
{TYPE: MISSING, ERROR_CODE: "GID08"},
{TYPE: REGEXP, MATCH: "^[A-Z]{2}[0-9]{6}$", ERROR_CODE: "GID11"},
]
},
{
@ -399,11 +484,9 @@ SHEETS_SCHEMA = {
},
STRAINS: {
"acronym": "STD",
'id_field': 'Accession number',
'id_field': 'accessionNumber',
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
ROW_VALIDATION: [
{TYPE: NAGOYA, ERROR_CODE: "STD46"},
],
ROW_VALIDATION: [],
COLUMNS: STRAIN_FIELDS,
},
LITERATURE_SHEET: {
@ -421,6 +504,18 @@ SHEETS_SCHEMA = {
{TYPE: MISSING, ERROR_CODE: "LID02"},
]
},
{
FIELD: "PMID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID18"},
]
},
{
FIELD: "DOI",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID20"},
]
},
{
FIELD: "Full reference",
VALIDATION: [
@ -465,7 +560,6 @@ SHEETS_SCHEMA = {
FIELD: "First page",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID15"},
{TYPE: MISSING, ERROR_CODE: "LID16"},
]
},
{
@ -504,13 +598,38 @@ SHEETS_SCHEMA = {
},
{
FIELD: "Name",
VALIDATION: []
},
]
},
CONTROL_SHEET: {
"acronym": "VRS",
"id_field": "Version",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS09"},
COLUMNS: [
{
FIELD: "Version",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "OTD03"},
{TYPE: MISSING, ERROR_CODE: "OTD04"},
{TYPE: MANDATORY, ERROR_CODE: "VRS01"},
{TYPE: MISSING, ERROR_CODE: "VRS02"},
{TYPE: VERSION, ERROR_CODE: "VRS05"},
]
},
{
FIELD: "Date",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "VRS03"},
{TYPE: MISSING, ERROR_CODE: "VRS04"},
]
},
]
},
MARKERS: {
"acronym": "MKD",
"id_field": "Acronym",
@ -527,19 +646,28 @@ SHEETS_SCHEMA = {
},
}
CROSS_REF_CONF = {
ONTOBIOTOPE: ['ID', 'Name'],
LITERATURE_SHEET: ['ID'],
LOCATIONS: ['Locality'],
ONTOBIOTOPE: ['ID'],
LITERATURE_SHEET: ['ID', 'DOI', 'PMID', 'Full reference', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'],
LOCATIONS: ['ID', 'Locality'],
GROWTH_MEDIA: ['Acronym'],
STRAINS: ["Accession number"],
STRAINS: ["accessionNumber"],
SEXUAL_STATE_SHEET: [],
MARKERS: ["Acronym"],
}
MIRRI_20200601_VALLIDATION_CONF = {
MIRRI_12052023_VALLIDATION_CONF = {
'sheet_schema': SHEETS_SCHEMA,
'cross_ref_conf': CROSS_REF_CONF,
'keep_sheets_in_memory': [
{'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}]
}
version_config = {
'5.1.2': MIRRI_12052023_VALLIDATION_CONF,
'date': '12/05/2023'
}