diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 2a0bb6c..0000000 --- a/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -mirri/biolomics/secrets.py -.vscode/launch.json -*.pyc diff --git a/README.md b/README.md deleted file mode 100644 index 43132ef..0000000 --- a/README.md +++ /dev/null @@ -1,34 +0,0 @@ -# MIRRI Utils - -## Installation - -> pip install path_to_package.tar.gz - - -## Description - -A small set of utilities to deal with Mirri Data. - - - A data class to deal with strain data. - - - An excel reader for mirri specification - - - An excel validator for mirri specification - - - An excel writer to create the excel with MIRRI specifications - - -## Update 06-09-2022 -Under the bin directory: -bin\ - upload_strains_to_mirri_is_NEWDB.py - validateNEW.py - -those files are created to inserting the data from the excel files into the database mirridb. - -validateNEW.py: ->the purpose of this file is to be the orchestator for the validations and the calling the upload to mirridb. - - -upload_strains_to_mirri_is_NEWDB.py: -This script is inserting the excel into the database, the code has comments regarding the steps. \ No newline at end of file diff --git a/mirri/TODO.txt b/TODO.txt similarity index 100% rename from mirri/TODO.txt rename to TODO.txt diff --git a/mirri/__init__.py b/__init__.py similarity index 100% rename from mirri/__init__.py rename to __init__.py diff --git a/__pycache__/__init__.cpython-311.pyc b/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..c57a87e Binary files /dev/null and b/__pycache__/__init__.cpython-311.pyc differ diff --git a/__pycache__/settings.cpython-311.pyc b/__pycache__/settings.cpython-311.pyc new file mode 100644 index 0000000..542840d Binary files /dev/null and b/__pycache__/settings.cpython-311.pyc differ diff --git a/__pycache__/settings_v1.cpython-311.pyc b/__pycache__/settings_v1.cpython-311.pyc new file mode 100644 index 0000000..7ee855f Binary files /dev/null and b/__pycache__/settings_v1.cpython-311.pyc differ diff --git a/__pycache__/validate_v5.cpython-311.pyc b/__pycache__/validate_v5.cpython-311.pyc new file mode 100644 index 0000000..b4f0f21 Binary files /dev/null and b/__pycache__/validate_v5.cpython-311.pyc differ diff --git a/bin/delete_duplicated_strain_by_number.py b/bin/delete_duplicated_strain_by_number.py deleted file mode 100644 index 6c8bc46..0000000 --- a/bin/delete_duplicated_strain_by_number.py +++ /dev/null @@ -1,77 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import sys - -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS, STRAIN_WS - -SERVER_URL = 'https://webservices.bio-aware.com/mirri_test' - - -def get_cmd_args(): - desc = "Upload strains to MIRRI-IS" - parser = argparse.ArgumentParser(description=desc) - parser.add_argument('-a', '--accession_number', required=True, - help='Delete the duplicated items in database for the given accession number') - parser.add_argument('-u', '--ws_user', help='Username of the web service', - required=True) - parser.add_argument('-p', '--ws_password', required=True, - help='Password of the web service user') - parser.add_argument('-c', '--client_id', required=True, - help='Client id of the web service') - parser.add_argument('-s', '--client_secret', required=True, - help='Client secret of the web service') - - args = parser.parse_args() - - return {'accession_number': args.accession_number, 'user': args.ws_user, - 'password': args.ws_password, 
'client_id': args.client_id, - 'client_secret': args.client_secret} - - -def write_errors_in_screen(errors, fhand=sys.stderr): - for key, errors_by_type in errors.items(): - fhand.write(f'{key}\n') - fhand.write('-' * len(key) + '\n') - for error in errors_by_type: - if error.pk: - fhand.write(f'{error.pk}: ') - fhand.write(f'{error.message} - {error.code}\n') - fhand.write('\n') - - -def main(): - args = get_cmd_args() - out_fhand = sys.stdout - - client = BiolomicsMirriClient(server_url=SERVER_URL, api_version= 'v2', - client_id=args['client_id'], - client_secret=args['client_secret'], - username=args['user'], - password=args['password']) - query = {"Query": [{"Index": 0, - "FieldName": "Collection accession number", - "Operation": "TextExactMatch", - "Value": args['accession_number']}], - "Expression": "Q0", - "DisplayStart": 0, - "DisplayLength": 10} - - result = client.search(STRAIN_WS, query=query) - total = result["total"] - if total == 0: - out_fhand.write('Accession not in database\n') - sys.exit(0) - return None - elif total == 1: - out_fhand.write('Accession is not duplicated\n') - sys.exit(0) - - print(f'Duplicates found: {total}. removing duplicates') - duplicated_ids = [record.record_id for record in result['records']] - for duplicated_id in duplicated_ids[:-1]: - client.delete_by_id(STRAIN_WS, duplicated_id) - - -if __name__ == '__main__': - main() diff --git a/bin/delete_mirri_data.py b/bin/delete_mirri_data.py deleted file mode 100644 index 92bffcb..0000000 --- a/bin/delete_mirri_data.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import sys - -from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS, STRAIN_WS -from mirri.io.parsers.mirri_excel import parse_mirri_excel -from mirri.validation.excel_validator import validate_mirri_excel - -SERVER_URL = 'https://webservices.bio-aware.com/mirri_test' - - -def get_cmd_args(): - desc = "Upload strains to MIRRI-IS" - parser = argparse.ArgumentParser(description=desc) - parser.add_argument('-i', '--input', help='Validated Excel file', - type=argparse.FileType('rb'), required=True) - parser.add_argument('-v', '--spec_version', default='20200601', - help='Version of he specification of the given excel file') - parser.add_argument('-u', '--ws_user', help='Username of the web service', - required=True) - parser.add_argument('-p', '--ws_password', required=True, - help='Password of the web service user') - parser.add_argument('-c', '--client_id', required=True, - help='Client id of the web service') - parser.add_argument('-s', '--client_secret', required=True, - help='Client secret of the web service') - parser.add_argument('-f', '--force_update', required=False, - action='store_true', - help='Use it if you want to update the existing strains') - - args = parser.parse_args() - - return {'input_fhand': args.input, 'user': args.ws_user, - 'version': args.spec_version, - 'password': args.ws_password, 'client_id': args.client_id, - 'client_secret': args.client_secret, 'update': args.force_update} - - -def write_errors_in_screen(errors, fhand=sys.stderr): - for key, errors_by_type in errors.items(): - fhand.write(f'{key}\n') - fhand.write('-' * len(key) + '\n') - for error in errors_by_type: - if error.pk: - fhand.write(f'{error.pk}: ') - fhand.write(f'{error.message} - {error.code}\n') - fhand.write('\n') - - -def main(): - args = get_cmd_args() - 
input_fhand = args['input_fhand'] - spec_version = args['version'] - out_fhand = sys.stderr - error_log = validate_mirri_excel(input_fhand, version=spec_version) - errors = error_log.get_errors() - if errors: - write_errors_in_screen(errors, out_fhand) - sys.exit(1) - - input_fhand.seek(0) - parsed_objects = parse_mirri_excel(input_fhand, version=spec_version) - strains = list(parsed_objects['strains']) - growth_media = list(parsed_objects['growth_media']) - - client = BiolomicsMirriClient(server_url=SERVER_URL, api_version= 'v2', - client_id=args['client_id'], - client_secret=args['client_secret'], - username=args['user'], - password=args['password']) - for gm in growth_media: - try: - client.delete_by_name(GROWTH_MEDIUM_WS, gm.acronym) - except ValueError as error: - print(error) - continue - print(f'Growth medium {gm.acronym} deleted') - - for strain in strains: - ws_strain = retrieve_strain_by_accession_number(client, strain.id.strain_id) - if ws_strain is not None: - client.delete_by_id(STRAIN_WS, ws_strain.record_id) - print(f'Strain {strain.id.strain_id} deleted') - else: - print(f'Strain {strain.id.strain_id} not in database') - - -if __name__ == '__main__': - main() diff --git a/bin/upload_strains_to_mirri_is.py b/bin/upload_strains_to_mirri_is.py deleted file mode 100644 index d6b7daf..0000000 --- a/bin/upload_strains_to_mirri_is.py +++ /dev/null @@ -1,182 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import sys -from collections import Counter - -from mirri.biolomics.pipelines.growth_medium import get_or_create_or_update_growth_medium -from mirri.biolomics.pipelines.strain import get_or_create_or_update_strain -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.io.parsers.mirri_excel import parse_mirri_excel -from mirri.validation.excel_validator import validate_mirri_excel - -TEST_SERVER_URL = 'https://webservices.bio-aware.com/mirri_test' -PROD_SERVER_URL = 'https://webservices.bio-aware.com/mirri' - - -def get_cmd_args(): - desc = "Upload strains to MIRRI-IS" - parser = argparse.ArgumentParser(description=desc) - parser.add_argument('-i', '--input', help='Validated Excel file', - type=argparse.FileType('rb'), required=True) - parser.add_argument('-v', '--spec_version', default='20200601', - help='Version of he specification of the given excel file') - parser.add_argument('-u', '--ws_user', help='Username of the web service', - required=True) - parser.add_argument('-p', '--ws_password', required=True, - help='Password of the web service user') - parser.add_argument('-c', '--client_id', required=True, - help='Client id of the web service') - parser.add_argument('-s', '--client_secret', required=True, - help='Client secret of the web service') - parser.add_argument('--force_update', required=False, - action='store_true', - help='Use it if you want to update the existing strains') - parser.add_argument('--verbose', action='store_true', - help='use it if you want a verbose output') - parser.add_argument('--prod', action='store_true', - help='Use production server') - parser.add_argument('--dont_add_gm', action='store_false', - help="Don't add growth media", default=True) - parser.add_argument('--dont_add_strains', action='store_false', - help="Don't add growth media", default=True) - parser.add_argument('--skip_first_num', type=int, - help='skip first X strains to the tool') - - args = parser.parse_args() - - return {'input_fhand': args.input, 'user': args.ws_user, - 'version': args.spec_version, - 'password': args.ws_password, 'client_id': 
args.client_id, - 'client_secret': args.client_secret, 'update': args.force_update, - 'verbose': args.verbose, 'use_production_server': args.prod, - 'add_gm': args.dont_add_gm, 'add_strains': args.dont_add_strains, - 'skip_first_num': args.skip_first_num} - - -def write_errors_in_screen(errors, fhand=sys.stderr): - for key, errors_by_type in errors.items(): - fhand.write(f'{key}\n') - fhand.write('-' * len(key) + '\n') - for error in errors_by_type: - if error.pk: - fhand.write(f'{error.pk}: ') - fhand.write(f'{error.message} - {error.code}\n') - fhand.write('\n') - - -def create_or_upload_strains(client, strains, update=False, counter=None, - out_fhand=None, seek=None): - for index, strain in enumerate(strains): - if seek is not None and index < seek: - continue - # if strain.id.strain_id != 'CECT 5766': - # continue - result = get_or_create_or_update_strain(client, strain, update=update) - - new_strain = result['record'] - created = result['created'] - updated = result.get('updated', False) - if updated: - result_state = 'updated' - elif created: - result_state = 'created' - else: - result_state = 'not modified' - if counter is not None: - counter[result_state] += 1 - if out_fhand is not None: - out_fhand.write(f'{index}: Strain {new_strain.id.strain_id}: {result_state}\n') - # break - - -def create_or_upload_growth_media(client, growth_media, update=False, counter=None, - out_fhand=None): - - for gm in growth_media: - result = get_or_create_or_update_growth_medium(client, gm, update) - - new_gm = result['record'] - created = result['created'] - updated = result.get('updated', False) - if updated: - result_state = 'updated' - elif created: - result_state = 'created' - else: - result_state = 'not modified' - if counter is not None: - counter[result_state] += 1 - if out_fhand is not None: - out_fhand.write(f'Growth medium {new_gm.record_name}: {result_state}\n') - - -def main(): - args = get_cmd_args() - input_fhand = args['input_fhand'] - spec_version = args['version'] - out_fhand = sys.stdout - error_log = validate_mirri_excel(input_fhand, version=spec_version) - errors = error_log.get_errors() - skip_first_num = args['skip_first_num'] - if errors: - write_errors_in_screen(errors, out_fhand) - sys.exit(1) - - input_fhand.seek(0) - parsed_objects = parse_mirri_excel(input_fhand, version=spec_version) - strains = list(parsed_objects['strains']) - growth_media = list(parsed_objects['growth_media']) - - server_url = PROD_SERVER_URL if args['use_production_server'] else TEST_SERVER_URL - - client = BiolomicsMirriClient(server_url=server_url, api_version='v2', - client_id=args['client_id'], - client_secret=args['client_secret'], - username=args['user'], - password=args['password'], - verbose=args['verbose']) - - if args['add_gm']: - client.start_transaction() - counter = Counter() - try: - create_or_upload_growth_media(client, growth_media, update=args['update'], - counter=counter, out_fhand=out_fhand) - except (Exception, KeyboardInterrupt) as error: - out_fhand.write('There were some errors in the Growth media upload\n') - out_fhand.write(str(error) + '\n') - out_fhand.write('Rolling back\n') - client.rollback() - raise - client.finish_transaction() - show_stats(counter, 'Growth Media', out_fhand) - - if args['add_strains']: - client.start_transaction() - counter = Counter() - try: - create_or_upload_strains(client, strains, update=args['update'], - counter=counter, - out_fhand=out_fhand, seek=skip_first_num) - client.finish_transaction() - except (Exception, KeyboardInterrupt) as error: - 
out_fhand.write('There were some errors in the Strain upload\n') - out_fhand.write(str(error) + '\n') - out_fhand.write('rolling back\n') - # client.rollback() - raise - client.finish_transaction() - show_stats(counter, 'Strains', out_fhand) - - -def show_stats(counter, kind, out_fhand): - out_fhand.write(f'{kind}\n') - line = ''.join(['-'] * len(kind)) - out_fhand.write(f"{line}\n") - for kind2, value in counter.most_common(5): - out_fhand.write(f'{kind2}: {value}\n') - out_fhand.write('\n') - - -if __name__ == '__main__': - main() diff --git a/bin/upload_strains_to_mirri_is_NEWDB.py b/bin/upload_strains_to_mirri_is_NEWDB.py deleted file mode 100644 index a9a8e87..0000000 --- a/bin/upload_strains_to_mirri_is_NEWDB.py +++ /dev/null @@ -1,224 +0,0 @@ -#!/usr/bin/env python3 -import argparse -from cmath import nan -import sys -from collections import Counter - -# -from mirri.biolomics.pipelines.growth_medium import get_or_create_or_update_growth_medium -from mirri.biolomics.pipelines.strain import get_or_create_or_update_strain -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.io.parsers.mirri_excel import parse_mirri_excel -from mirri.validation.excel_validator import validate_mirri_excel - -##Database -from sqlalchemy import create_engine, MetaData -import pymysql -import pandas as pd -from pathlib import Path -import numpy as np - -# DEFINE THE DATABASE CREDENTIALS -user = 'mirridev' -password = 'estramboticandolotodo' -host = 'mirri-is.mirri.org' -port = 33066 -database = 'mirri-db' - -TEST_SERVER_URL = 'https://webservices.bio-aware.com/mirri_test' -PROD_SERVER_URL = 'https://webservices.bio-aware.com/mirri' - -def show_stats(counter, kind, out_fhand): - out_fhand.write(f'{kind}\n') - line = ''.join(['-'] * len(kind)) - out_fhand.write(f"{line}\n") - for kind2, value in counter.most_common(5): - out_fhand.write(f'{kind2}: {value}\n') - out_fhand.write('\n') - - -def get_cmd_args(): - desc = "Upload strains to MIRRI-IS" - parser = argparse.ArgumentParser(description=desc) - parser.add_argument('-i' , '--input', required=True, help='Validated Excel file', type=argparse.FileType('rb')) - parser.add_argument('-v' , '--spec_version', default='20200601', help='Version of he specification of the given excel file') - parser.add_argument('-u' , '--ws_user', help='Username of the web service') - parser.add_argument('-p' , '--ws_password', required=True, help='Password of the web service user') - parser.add_argument('-c' , '--client_id', required=True, help='Client id of the web service') - parser.add_argument('-s' , '--client_secret', required=True, help='Client secret of the web service') - parser.add_argument('--force_update' , required=False, action='store_true', help='Use it if you want to update the existing strains') - parser.add_argument('--verbose' , action='store_true', help='use it if you want a verbose output') - parser.add_argument('--prod' , action='store_true', help='Use production server') - parser.add_argument('--dont_add_gm' , default=True, action='store_false', help="Don't add growth media") - parser.add_argument('--dont_add_strains', default=True, action='store_false', help="Don't add growth media") - parser.add_argument('--skip_first_num' , type=int, help='skip first X strains to the tool') - - args = parser.parse_args() - - return {'input_fhand': args.input - ,'user': args.ws_user - ,'version': args.spec_version - ,'password': args.ws_password - ,'client_id': args.client_id - ,'client_secret': args.client_secret - ,'update': 
args.force_update - ,'verbose': args.verbose - ,'use_production_server': args.prod - ,'add_gm': args.dont_add_gm - ,'add_strains': args.dont_add_strains - ,'skip_first_num': args.skip_first_num - } - - -def write_errors_in_screen(errors, fhand=sys.stderr): - for key, errors_by_type in errors.items(): - fhand.write(f'{key}\n') - fhand.write('-' * len(key) + '\n') - for error in errors_by_type: - if error.pk: - fhand.write(f'{error.pk}: ') - fhand.write(f'{error.message} - {error.code}\n') - fhand.write('\n') - - -def create_or_upload_strains(client, strains, update=False, counter=None, out_fhand=None, seek=None): - - for index, strain in enumerate(strains): - if seek is not None and index < seek: - continue - # if strain.id.strain_id != 'CECT 5766': - # continue - result = get_or_create_or_update_strain(client, strain, update=update) - - new_strain = result['record'] - created = result['created'] - updated = result.get('updated', False) - if updated: - result_state = 'updated' - elif created: - result_state = 'created' - else: - result_state = 'not modified' - - if counter is not None: - counter[result_state] += 1 - if out_fhand is not None: - out_fhand.write(f'{index}: Strain {new_strain.id.strain_id}: {result_state}\n') - # break - - -def create_or_upload_growth_media(client, growth_media, update=False, counter=None, out_fhand=None): - - for gm in growth_media: - result = get_or_create_or_update_growth_medium(client, gm, update) - - new_gm = result['record'] - created = result['created'] - updated = result.get('updated', False) - if updated: - result_state = 'updated' - elif created: - result_state = 'created' - else: - result_state = 'not modified' - if counter is not None: - counter[result_state] += 1 - if out_fhand is not None: - out_fhand.write(f'Growth medium {new_gm.record_name}: {result_state}\n') - - -def get_connection(): - # PYTHON FUNCTION TO CONNECT TO THE MYSQL DATABASE AND - # RETURN THE SQLACHEMY ENGINE OBJECT - return create_engine(url="mysql+pymysql://{0}:{1}@{2}:{3}/{4}".format(user, password, host, port, database)) - -def main(): - ## Load Excel - path = Path('C://data//brclims_excel.xlsx') - Excel_Data = pd.read_excel(path, sheet_name = None) - cc_id=1 - - ## Load Database - sqlEngine = get_connection() - engine = sqlEngine.connect() - - ## Create new file upload - with engine.connect() as conn: - metaDats = MetaData(conn, schema=database) - metaDats.reflect(bind=conn) - table = metaDats.tables['mirri-db.file_upload'] - stmt = table.insert().values(filename=path.name,cc_id=cc_id) - aux=conn.execute(stmt).inserted_primary_key[0] - - ## Load all Sheet from Excel to DB - for key in Excel_Data.keys(): - #print(key) - n=Excel_Data[key].replace(np.nan, '', regex=True).astype(str) - n.columns = n.columns.str.replace(' ','_') - n['f_id']=aux - n.to_sql(key, engine, index=False, if_exists='append') - - - - -""" - args = get_cmd_args() - input_fhand = args['input_fhand'] - spec_version = args['version'] - out_fhand = sys.stdout - error_log = validate_mirri_excel(input_fhand, version=spec_version) - errors = error_log.get_errors() - skip_first_num = args['skip_first_num'] - if errors: - write_errors_in_screen(errors, out_fhand) - sys.exit(1) - - input_fhand.seek(0) - parsed_objects = parse_mirri_excel(input_fhand, version=spec_version) - strains = list(parsed_objects['strains']) - growth_media = list(parsed_objects['growth_media']) - - server_url = PROD_SERVER_URL if args['use_production_server'] else TEST_SERVER_URL - - client = BiolomicsMirriClient(server_url=server_url, 
api_version='v2', - client_id=args['client_id'], - client_secret=args['client_secret'], - username=args['user'], - password=args['password'], - verbose=args['verbose']) - - if args['add_gm']: - client.start_transaction() - counter = Counter() - try: - create_or_upload_growth_media(client, growth_media, update=args['update'], - counter=counter, out_fhand=out_fhand) - except (Exception, KeyboardInterrupt) as error: - out_fhand.write('There were some errors in the Growth media upload\n') - out_fhand.write(str(error) + '\n') - out_fhand.write('Rolling back\n') - client.rollback() - raise - client.finish_transaction() - show_stats(counter, 'Growth Media', out_fhand) - - if args['add_strains']: - client.start_transaction() - counter = Counter() - try: - create_or_upload_strains(client, strains, update=args['update'], - counter=counter, - out_fhand=out_fhand, seek=skip_first_num) - client.finish_transaction() - except (Exception, KeyboardInterrupt) as error: - out_fhand.write('There were some errors in the Strain upload\n') - out_fhand.write(str(error) + '\n') - out_fhand.write('rolling back\n') - # client.rollback() - raise - client.finish_transaction() - show_stats(counter, 'Strains', out_fhand) -""" - -if __name__ == '__main__': - main() diff --git a/bin/validateNEW.py b/bin/validateNEW.py deleted file mode 100644 index 923bc5c..0000000 --- a/bin/validateNEW.py +++ /dev/null @@ -1,21 +0,0 @@ -#!/usr/bin/env python -import sys -from pathlib import Path -from mirri.validation.excel_validator import validate_mirri_excel -import warnings -warnings.simplefilter("ignore") - - -def main(): - # path = Path(sys.argv[1]) - path = Path( 'C:/data/brclims_excel.xlsx') - - error_log = validate_mirri_excel(path.open("rb")) - - for errors in error_log.get_errors().values(): - for error in errors: - print(error.pk, error.message, error.code) - - -if __name__ == "__main__": - main() diff --git a/mirri/biolomics/__init__.py b/biolomics/__init__.py similarity index 100% rename from mirri/biolomics/__init__.py rename to biolomics/__init__.py diff --git a/mirri/biolomics/pipelines/__init__.py b/biolomics/pipelines/__init__.py similarity index 100% rename from mirri/biolomics/pipelines/__init__.py rename to biolomics/pipelines/__init__.py diff --git a/mirri/biolomics/pipelines/growth_medium.py b/biolomics/pipelines/growth_medium.py similarity index 100% rename from mirri/biolomics/pipelines/growth_medium.py rename to biolomics/pipelines/growth_medium.py diff --git a/mirri/biolomics/pipelines/strain.py b/biolomics/pipelines/strain.py similarity index 100% rename from mirri/biolomics/pipelines/strain.py rename to biolomics/pipelines/strain.py diff --git a/mirri/biolomics/remote/__init__.py b/biolomics/remote/__init__.py similarity index 100% rename from mirri/biolomics/remote/__init__.py rename to biolomics/remote/__init__.py diff --git a/mirri/biolomics/remote/biolomics_client.py b/biolomics/remote/biolomics_client.py similarity index 100% rename from mirri/biolomics/remote/biolomics_client.py rename to biolomics/remote/biolomics_client.py diff --git a/mirri/biolomics/remote/endoint_names.py b/biolomics/remote/endoint_names.py similarity index 100% rename from mirri/biolomics/remote/endoint_names.py rename to biolomics/remote/endoint_names.py diff --git a/mirri/biolomics/remote/rest_client.py b/biolomics/remote/rest_client.py similarity index 100% rename from mirri/biolomics/remote/rest_client.py rename to biolomics/remote/rest_client.py diff --git a/mirri/biolomics/serializers/__init__.py 
b/biolomics/serializers/__init__.py similarity index 100% rename from mirri/biolomics/serializers/__init__.py rename to biolomics/serializers/__init__.py diff --git a/mirri/biolomics/serializers/bibliography.py b/biolomics/serializers/bibliography.py similarity index 100% rename from mirri/biolomics/serializers/bibliography.py rename to biolomics/serializers/bibliography.py diff --git a/mirri/biolomics/serializers/growth_media.py b/biolomics/serializers/growth_media.py similarity index 100% rename from mirri/biolomics/serializers/growth_media.py rename to biolomics/serializers/growth_media.py diff --git a/mirri/biolomics/serializers/locality.py b/biolomics/serializers/locality.py similarity index 100% rename from mirri/biolomics/serializers/locality.py rename to biolomics/serializers/locality.py diff --git a/mirri/biolomics/serializers/ontobiotope.py b/biolomics/serializers/ontobiotope.py similarity index 100% rename from mirri/biolomics/serializers/ontobiotope.py rename to biolomics/serializers/ontobiotope.py diff --git a/mirri/biolomics/serializers/sequence.py b/biolomics/serializers/sequence.py similarity index 100% rename from mirri/biolomics/serializers/sequence.py rename to biolomics/serializers/sequence.py diff --git a/mirri/biolomics/serializers/strain.py b/biolomics/serializers/strain.py similarity index 100% rename from mirri/biolomics/serializers/strain.py rename to biolomics/serializers/strain.py diff --git a/mirri/biolomics/serializers/taxonomy.py b/biolomics/serializers/taxonomy.py similarity index 100% rename from mirri/biolomics/serializers/taxonomy.py rename to biolomics/serializers/taxonomy.py diff --git a/mirri/biolomics/settings.py b/biolomics/settings.py similarity index 100% rename from mirri/biolomics/settings.py rename to biolomics/settings.py diff --git a/mirri/data/ontobiotopes.csv b/data/ontobiotopes.csv similarity index 100% rename from mirri/data/ontobiotopes.csv rename to data/ontobiotopes.csv diff --git a/docs/Error_Log_Style_Sheet.docx b/docs/Error_Log_Style_Sheet.docx deleted file mode 100644 index 0aa7af8..0000000 Binary files a/docs/Error_Log_Style_Sheet.docx and /dev/null differ diff --git a/docs/ICT-TaskForce_HowToCompileTheSheets_v20200601.pdf b/docs/ICT-TaskForce_HowToCompileTheSheets_v20200601.pdf deleted file mode 100644 index 4ebbc8b..0000000 Binary files a/docs/ICT-TaskForce_HowToCompileTheSheets_v20200601.pdf and /dev/null differ diff --git a/docs/ICT-TaskForce_RecommendationsToCollections_v20200601.pdf b/docs/ICT-TaskForce_RecommendationsToCollections_v20200601.pdf deleted file mode 100644 index 2eda93b..0000000 Binary files a/docs/ICT-TaskForce_RecommendationsToCollections_v20200601.pdf and /dev/null differ diff --git a/mirri/entities/__init__.py b/entities/__init__.py similarity index 100% rename from mirri/entities/__init__.py rename to entities/__init__.py diff --git a/mirri/entities/_private_classes.py b/entities/_private_classes.py similarity index 100% rename from mirri/entities/_private_classes.py rename to entities/_private_classes.py diff --git a/mirri/entities/date_range.py b/entities/date_range.py similarity index 100% rename from mirri/entities/date_range.py rename to entities/date_range.py diff --git a/mirri/entities/growth_medium.py b/entities/growth_medium.py similarity index 100% rename from mirri/entities/growth_medium.py rename to entities/growth_medium.py diff --git a/mirri/entities/location.py b/entities/location.py similarity index 100% rename from mirri/entities/location.py rename to entities/location.py diff --git 
a/mirri/entities/publication.py b/entities/publication.py similarity index 100% rename from mirri/entities/publication.py rename to entities/publication.py diff --git a/mirri/entities/sequence.py b/entities/sequence.py similarity index 100% rename from mirri/entities/sequence.py rename to entities/sequence.py diff --git a/mirri/entities/strain.py b/entities/strain.py similarity index 100% rename from mirri/entities/strain.py rename to entities/strain.py diff --git a/mirri/io/__init__.py b/io/__init___1.py similarity index 100% rename from mirri/io/__init__.py rename to io/__init___1.py diff --git a/mirri/io/parsers/__init__.py b/io/parsers/__init__.py similarity index 100% rename from mirri/io/parsers/__init__.py rename to io/parsers/__init__.py diff --git a/io/parsers/__pycache__/__init__.cpython-311.pyc b/io/parsers/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..a3a1711 Binary files /dev/null and b/io/parsers/__pycache__/__init__.cpython-311.pyc differ diff --git a/io/parsers/__pycache__/excel.cpython-311.pyc b/io/parsers/__pycache__/excel.cpython-311.pyc new file mode 100644 index 0000000..5189d49 Binary files /dev/null and b/io/parsers/__pycache__/excel.cpython-311.pyc differ diff --git a/mirri/io/parsers/excel.py b/io/parsers/excel.py similarity index 100% rename from mirri/io/parsers/excel.py rename to io/parsers/excel.py diff --git a/mirri/io/parsers/mirri_excel.py b/io/parsers/mirri_excel.py similarity index 83% rename from mirri/io/parsers/mirri_excel.py rename to io/parsers/mirri_excel.py index 549ae3d..d65d1ce 100644 --- a/mirri/io/parsers/mirri_excel.py +++ b/io/parsers/mirri_excel.py @@ -19,6 +19,12 @@ from mirri.settings import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO, NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION, ONLY_RESEARCH, ONTOBIOTOPE, PUBLICATION_FIELDS, STRAINS, SUBTAXAS) +from mirri.settings_v1 import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO, + GROWTH_MEDIA, LITERATURE_SHEET, LOCATIONS, + MIRRI_FIELDS, NAGOYA_DOCS_AVAILABLE, NAGOYA_NO_RESTRICTIONS, + NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION, + ONLY_RESEARCH, ONTOBIOTOPE, + PUBLICATION_FIELDS, STRAINS, SUBTAXAS) from mirri.utils import get_country_from_name RESTRICTION_USE_TRANSLATOR = { @@ -37,11 +43,13 @@ TRUEFALSE_TRANSLATOR = { } -def parse_mirri_excel(fhand, version="20200601"): - if version == "20200601": +def parse_mirri_excel(fhand, version=""): + if version == "20200602": return _parse_mirri_v20200601(fhand) + elif version == "12052023": + return _parse_mirri_v12052023(fhand) else: - raise NotImplementedError("Only version 20200601 is implemented") + raise NotImplementedError("Only versions 20200601 and 12052023 are implemented") def _parse_mirri_v20200601(fhand): @@ -64,6 +72,25 @@ def _parse_mirri_v20200601(fhand): return {"strains": strains, "growth_media": growth_media} +def _parse_mirri_v12052023(fhand): + fhand.seek(0) + file_content = BytesIO(fhand.read()) + wb = load_workbook(filename=file_content, read_only=True, data_only=True) + + locations = workbook_sheet_reader(wb, LOCATIONS) + ontobiotopes = workbook_sheet_reader(wb, ONTOBIOTOPE) + + growth_media = list(parse_growth_media(wb)) + + markers = workbook_sheet_reader(wb, GENOMIC_INFO) + + publications = list(parse_publications(wb)) + + strains = parse_strains(wb, locations=locations, growth_media=growth_media, + markers=markers, publications=publications, + ontobiotopes=ontobiotopes) + + return {"strains": strains, "growth_media": growth_media} def index_list_by(list_, id_): return {str(item[id_]): item for item in list_} @@ 
-125,7 +152,7 @@ def parse_strains(wb, locations, growth_media, markers, publications, publications = index_list_by_attr(publications, 'id') markers = index_markers(markers) - for strain_row in workbook_sheet_reader(wb, STRAINS, "Accession number"): + for strain_row in workbook_sheet_reader(wb, STRAINS, "accessionNumber"): strain = StrainMirri() strain_id = None label = None @@ -140,7 +167,7 @@ def parse_strains(wb, locations, growth_media, markers, publications, collection, number = value.split(" ", 1) value = StrainId(collection=collection, number=number) rsetattr(strain, attribute, value) - + elif attribute == "restriction_on_use": rsetattr(strain, attribute, RESTRICTION_USE_TRANSLATOR[value]) elif attribute == "nagoya_protocol": @@ -202,9 +229,19 @@ def parse_strains(wb, locations, growth_media, markers, publications, items = value.split(";") strain.collect.location.latitude = float(items[0]) strain.collect.location.longitude = float(items[1]) + strain.collect.location.precision = float(items[2]) + strain.collect.location.altitude = float(items[3]) + if len(items) > 4: + strain.collect.location.coord_uncertainty = items[4] + + elif attribute == "collect.site.links": + items = value.split(";") + strain.collect.site.links.nameSite = str(items[0]) + strain.collect.site.links.urlSite = str(items[1]) + rsetattr(strain, attribute, value.split(";")) #ver o separador if len(items) > 2: - strain.collect.location.coord_uncertainty = items[2] - + strain.collect.site.links.site_uncertainty = items[2] + elif attribute == "collect.location": location = locations[value] if 'Country' in location and location['Country']: diff --git a/mirri/io/writers/__init__.py b/io/writers/__init__.py similarity index 100% rename from mirri/io/writers/__init__.py rename to io/writers/__init__.py diff --git a/mirri/io/writers/mirri_excel.py b/io/writers/mirri_excel.py similarity index 76% rename from mirri/io/writers/mirri_excel.py rename to io/writers/mirri_excel.py index b4cb4ac..89e0b4a 100644 --- a/mirri/io/writers/mirri_excel.py +++ b/io/writers/mirri_excel.py @@ -5,6 +5,7 @@ from openpyxl.workbook.workbook import Workbook from mirri import rgetattr from mirri.settings import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS +from mirri.settings_v1 import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS from mirri.io.parsers.mirri_excel import NAGOYA_TRANSLATOR, RESTRICTION_USE_TRANSLATOR INITIAL_SEXUAL_STATES = [ @@ -53,7 +54,10 @@ def write_mirri_excel(path, strains, growth_media, version): if version == "20200601": _write_mirri_excel_20200601(path, strains, growth_media) + if version == "12052023": + _write_mirri_excel_12052023(path, strains, growth_media) + def _write_mirri_excel_20200601(path, strains, growth_media): wb = Workbook() @@ -104,7 +108,7 @@ def _write_mirri_excel_20200601(path, strains, growth_media): redimension_cell_width(pub_sheet) # write sexual states - sex_sheet = wb.create_sheet("Sexual states") + sex_sheet = wb.create_sheet("Sexual state") for sex_state in sorted(list(sexual_states)): sex_sheet.append([sex_state]) redimension_cell_width(sex_sheet) @@ -122,6 +126,73 @@ def _write_mirri_excel_20200601(path, strains, growth_media): wb.save(str(path)) +def _write_mirri_excel_12052023(path, strains, growth_media): + wb = Workbook() + + write_markers_sheet(wb) + + ontobiotope_path = DATA_DIR / "ontobiotopes.csv" + write_ontobiotopes(wb, ontobiotope_path) + + write_growth_media(wb, growth_media) + growth_media_indexes = [str(gm.acronym) for gm in growth_media] + + locations = {} + 
publications = {} + sexual_states = set(deepcopy(INITIAL_SEXUAL_STATES)) + genomic_markers = {} + strains_data = _deserialize_strains(strains, locations, growth_media_indexes, + publications, sexual_states, genomic_markers) + strains_data = list(strains_data) + + # write strain to generate indexed data + strain_sheet = wb.create_sheet("Strains") + strain_sheet.append([field["label"] for field in MIRRI_FIELDS]) + for strain_row in strains_data: + strain_sheet.append(strain_row) + redimension_cell_width(strain_sheet) + + # write locations + loc_sheet = wb.create_sheet("Geographic origin") + loc_sheet.append(["ID", "Country", "Region", "City", "Locality"]) + for index, loc_index in enumerate(locations.keys()): + location = locations[loc_index] + row = [index, location.country, location.state, location.municipality, + loc_index] + loc_sheet.append(row) + redimension_cell_width(loc_sheet) + + # write publications + pub_sheet = wb.create_sheet("Literature") + pub_sheet.append(PUB_HEADERS) + for publication in publications.values(): + row = [] + for pub_field in PUBLICATION_FIELDS: + # if pub_field['attribute'] == 'id': + # value = index + value = getattr(publication, pub_field['attribute'], None) + row.append(value) + pub_sheet.append(row) + redimension_cell_width(pub_sheet) + + # write sexual states + sex_sheet = wb.create_sheet("Sexual state") + for sex_state in sorted(list(sexual_states)): + sex_sheet.append([sex_state]) + redimension_cell_width(sex_sheet) + + # write genetic markers + markers_sheet = wb.create_sheet("Genomic information") + markers_sheet.append(['Strain AN', 'Marker', 'INSDC AN', 'Sequence']) + for strain_id, markers in genomic_markers.items(): + for marker in markers: + row = [strain_id, marker.marker_type, marker.marker_id, marker.marker_seq] + markers_sheet.append(row) + redimension_cell_width(markers_sheet) + + del wb["Sheet"] + wb.save(str(path)) + def _deserialize_strains(strains, locations, growth_media_indexes, publications, sexual_states, genomic_markers): for strain in strains: @@ -189,10 +260,21 @@ def _deserialize_strains(strains, locations, growth_media_indexes, elif attribute == "collect.location.coords": lat = strain.collect.location.latitude long = strain.collect.location.longitude - if lat is not None and long is not None: - value = f"{lat};{long}" + alt = strain.collect.location.altitude + prec = strain.collect.location.precision + if lat is not None and long is not None and prec is not None and alt is not None: + value = f"{lat};{long};{prec};{alt}" else: - value = None + value = None + elif attribute == "collect.site.links": + name = strain.collect.site.links.nameSite + url = strain.collect.site.links.urlSite + value = rgetattr(strain, attribute) + value = ";".join(value) + if name is not None and url is not None: + value = f"{name};{url}" + else: + value = None elif attribute == "collect.location": location = strain.collect.location diff --git a/mirri - v20230224/mirri-V20230324.zip b/mirri - v20230224/mirri-V20230324.zip deleted file mode 100644 index 371814f..0000000 Binary files a/mirri - v20230224/mirri-V20230324.zip and /dev/null differ diff --git a/mirri/validation/entity_validators.py b/mirri/validation/entity_validators.py deleted file mode 100644 index e1e02d0..0000000 --- a/mirri/validation/entity_validators.py +++ /dev/null @@ -1,50 +0,0 @@ -from mirri import rgetattr - - -def validate_strain(strain, version='20200601'): - if version == '20200601': - return _validate_strain_v20200601(strain) - raise NotImplementedError('Only v20200601 is 
implemented') - - -def _validate_strain_v20200601(strain): - mandatory_attrs = [{'label': 'Accession Number', 'attr': 'id.strain_id'}, - {'label': 'Nagoya protocol', 'attr': 'nagoya_protocol'}, - {'label': 'Restriction on use', 'attr': 'restriction_on_use'}, - {'label': 'Risk group', 'attr': 'risk_group'}, - {'label': 'Organism type', 'attr': 'taxonomy.organism_type'}, - {'label': 'Taxon name', 'attr': 'taxonomy.long_name'}, - {'label': 'Recommended temperature to growth', 'attr': 'growth.recommended_temp'}, - {'label': 'Recommended media', 'attr': 'growth.recommended_media'}, - {'label': 'Form of supply', 'attr': 'form_of_supply'}, - {'label': 'Country', 'attr': 'collect.location.country'}] - - errors = [] - - for mandatory in mandatory_attrs: - value = rgetattr(strain, mandatory['attr']) - if value is None: - errors.append(f"{mandatory['label']} is mandatory field") - - if not is_valid_nagoya(strain): - errors.append('Not compliant wih nagoya protocol requirements') - - return errors - - -def is_valid_nagoya(strain): - # nagoya_requirements - _date = strain.collect.date - if _date is None: - _date = strain.isolation.date - if _date is None: - _date = strain.deposit.date - if _date is None: - _date = strain.catalog_inclusion_date - # print(_date) - year = None if _date is None else _date._year - - if year is not None and year >= 2014 and strain.collect.location.country is None: - return False - - return True diff --git a/mirri/validation/error_logging/error_message.py b/mirri/validation/error_logging/error_message.py deleted file mode 100644 index 7188a9b..0000000 --- a/mirri/validation/error_logging/error_message.py +++ /dev/null @@ -1,414 +0,0 @@ -from typing import Optional - - -class ErrorMessage(): - """Error message - - Args: - code (str): Error code. - pk (str | optional): The instance's primary key that triggered the error. Defaults to None. - value (str | optional): The instance's value that triggered the error. Defaults to None. - """ - - def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None): - self.code = code.upper() - self.pk = pk - self.value = value - - @property - def _codes(self) -> list: - return [ - func - for func in dir(self) - if func.isupper() and - callable(getattr(self, func)) and - not func.startswith("__") - ] - - @property - def _messages(self) -> dict: - return {code: getattr(self, code) for code in self._codes} - - @property - def message(self) -> str: - if not self._validate_code(): - raise ValueError(f"{self.code} not found") - return self._messages[self.code]() - - @property - def code(self) -> str: - return self._code - - @code.setter - def code(self, code: str) -> None: - self._code = code.upper() - - def _validate_code(self) -> bool: - return self.code in self._codes - - @property - def pk(self) -> str: - return self._pk - - @pk.setter - def pk(self, pk: str) -> None: - self._pk = pk - - @property - def value(self) -> str: - return self._value - - @value.setter - def value(self, value: str) -> None: - self._value = value - - """ - Excel File Structure Error Codes - """ - - def EXL00(self): - return f"The provided file '{self.pk}' is not an excel(xlsx) file" - - def EFS01(self): - return "The 'Growth media' sheet is missing. Please check the provided excel template." - - def EFS02(self): - return "The 'Geographic origin' sheet is missing. Please check the provided excel template." - - def EFS03(self): - return "The 'Literature' sheet is missing. Please check the provided excel template." 
- - def EFS04(self): - return "The 'Sexual state' sheet is missing. Please check the provided excel template." - - def EFS05(self): - return "The 'Strains' sheet is missing. Please check the provided excel template." - - def EFS06(self): - return "The 'Ontobiotope' sheet is missing. Please check the provided excel template." - - def EFS07(self): - return "The 'Markers' sheet is missing. Please check the provided excel template." - - def EFS08(self): - return "The 'Genomic information' sheet is missing. Please check the provided excel template." - - """ - Growth Media Error Codes - """ - - def GMD01(self): - return "The 'Acronym' column is a mandatory field in the Growth Media sheet." - - def GMD02(self): - return "The 'Acronym' column is empty or has missing values." - - def GMD03(self): - return "The 'Description' column is a mandatory field in the Growth Media sheet. The column can not be empty." - - def GMD04(self): - return f"The 'Description' for growth media with Acronym {self.pk} is missing." - - """ - Geographic Origin Error Codes - """ - - def GOD01(self): - return "The 'ID' column is a mandatory field in the Geographic Origin sheet." - - def GOD02(self): - return "The 'ID' column is empty or has missing values." - - def GOD03(self): - return "The 'Country' column is a mandatory field in the Geographic Origin sheet. The column can not be empty." - - def GOD04(self): - return f"The 'Country' for geographic origin with ID {self.pk} is missing." - - def GOD05(self): - return f"The 'Country' for geographic origin with ID {self.pk} is incorrect." - - def GOD06(self): - return f"The 'Locality' column is a mandatory field in the Geographic Origin sheet. The column can not be empty." - - def GOD07(self): - return f"The 'Locality' for geographic origin with ID {self.pk} is missing." - - """ - Literature Error Codes - """ - - def LID01(self): - return "The 'ID' column is a mandatory field in the Literature sheet." - - def LID02(self): - return "The 'ID' column empty or missing values." - - def LID03(self): - return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty." - - def LID04(self): - return f"The 'Full reference' for literature with ID {self.pk} is missing." - - def LID05(self): - return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty." - - def LID06(self): - return f"The 'Authors' for literature with ID {self.pk} is missing." - - def LID07(self): - return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty." - - def LID08(self): - return f"The 'Title' for literature with ID {self.pk} is missing." - - def LID09(self): - return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty." - - def LID10(self): - return f"The 'Journal' for literature with ID {self.pk} is missing." - - def LID11(self): - return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty." - - def LID12(self,): - return f"The 'Year' for literature with ID {self.pk} is missing." - - def LID13(self): - return "The 'Volume' column is a mandatory field in the Literature sheet. The column can not be empty." - - def LID14(self): - return f"The 'Volume' for literature with ID {self.pk} is missing." - - def LID15(self): - return "The 'First page' column is a mandatory field. The column can not be empty." - - def LID16(self): - return f"The 'First page' for literature with ID {self.pk} is missing." 
- - def LID17(self): - msg = 'If journal; Title, Authors, journal, year and first page are required' - msg += 'If Book; Book Title, Authors, Year, Editors, Publishers' - return msg - - """ - Strains Error Codes - """ - - def STD01(self): - return "The 'Accession number' column is a mandatory field in the Strains sheet." - - def STD02(self): - return "The 'Accession number' column is empty or has missing values." - - def STD03(self): - return f"The 'Accesion number' must be unique. The '{self.value}' is repeated." - - def STD04(self): - return (f"The 'Accession number' {self.pk} is not according to the specification." - " The value must be of the format ' '.") - - def STD05(self): - return f"The 'Restriction on use' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD06(self): - return f"The 'Restriction on use' for strain with Accession Number {self.pk} is missing." - - def STD07(self): - return (f"The 'Restriction on use' for strain with Accession Number {self.pk} is not according to the specification." - f" Your value is {self.value} and the accepted values are 1, 2, 3.") - - def STD08(self): - return f"The 'Nagoya protocol restrictions and compliance conditions' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD09(self): - return f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is missing." - - def STD10(self): - return (f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is not according to the specification." - f" Your value is {self.value} and the accepted values are 1, 2, 3.") - - def STD11(self): - return (f"The 'Strain from a registered collection' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 1, 2, 3.") - - def STD12(self): - return "The 'Risk group' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD13(self): - return f"The 'Risk group' for strain with Accession Number {self.pk} is missing." - - def STD14(self): - return (f"The 'Risk group' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.") - - def STD15(self): - return (f"The 'Dual use' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 1, 2.") - - def STD16(self): - return (f"The “Quarantine in europe” for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 1, 2.") - - def STD17(self): - return f"The 'Organism type' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD18(self): - return f"The 'Organism type' for strain with Accession Number {self.pk} is missing." - - def STD19(self): - return (f"The 'Organism type' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', " - "'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.") - - def STD20(self): - return f"The 'Taxon name' column is a mandatory field in the Strains Sheet. The column can not be empty." 
- - def STD21(self): - return f"The 'Taxon name' for strain with Accession Number {self.pk} is missing." - - def STD22(self): - return f"The 'Taxon name' for strain with Accession Number {self.pk} is incorrect." - - def STD23(self): - return (f"The 'Interspecific hybrid' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 1, 2.") - - def STD24(self): - return f"The 'History of deposit' for strain with Accession Number {self.pk} is incorrect." - - def STD25(self): - return (f"The 'Date of deposit' for strain with Accession Number {self.pk} is incorrect." - " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") - - def STD26(self): - return (f"The 'Date of inclusion in the catalogue' for strain with Accession Number {self.pk} is incorrect." - " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") - - def STD27(self): - return (f"The 'Date of collection' for strain with Accession Number {self.pk} is incorrect." - " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") - - def STD28(self): - return (f"The 'Date of isolation' for strain with Accession Number {self.pk} is incorrect." - " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") - - def STD29(self): - return (f"The 'Tested temperature growth range' for strain with Accession Number {self.pk} is incorrect." - " It must have two decimal numbers separated by ','") - - def STD30(self): - return f"The 'Recommended growth temperature' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD31(self): - return f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is missing." - - def STD32(self): - return (f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is incorrect." - " It must have two decimal numbers separated by ','.") - - def STD33(self): - return f"The 'Recommended medium for growth' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD34(self): - return f"The 'Recommended medium for growth' for strain with Accession Number {self.pk} is missing." - - def STD35(self): - return f"The value of 'Recommended medium for growth' for strain with Accession Number {self.pk} is not in the Growth Media Sheet." - - def STD36(self): - return f"The 'Forms of supply' column is a mandatory field in the Strains Sheet. The column can not be empty." - - def STD37(self): - return f"The 'Forms of supply' for strain with Accession Number {self.pk} is missing." - - def STD38(self): - return f"The value of 'Forms of supply' for strain with Accession Number {self.pk} is not in the Forms of Supply Sheet." - - def STD39(self): - return (f"The 'Coordinates of geographic origin' column for strain with Accession Number {self.pk} is incorrect." - "The allowed formats are two or three decimal numbers separated by ','. Moreover, the first number must be" - "between [-90, 90], the second between [-180, 180], and the third, if provided, can assume any value.") - - def STD40(self): - return (f"The 'Altitude of geographic origin' column for strain with Accession Number {self.pk} is incorrect." - "The allowed formats are one decimal number between [-200, 8000].") - - def STD41(self): - return f"The value of 'Ontobiotope term for the isolation habitat' for strain with Accession Number {self.pk} is not in the Ontobiotope Sheet." 
- - def STD42(self): - return (f"The 'GMO' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 1, 2") - - def STD43(self): - return (f"The 'Sexual State' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', " - "'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'") - - def STD44(self): - return (f"The 'Ploidy' for strain with Accession Number {self.pk} is not according to specification." - f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9") - - def STD45(self): - msg = f"At least one of the values '{self.value}' of the literature field for strain {self.pk} are not in the literature sheet. " - msg += "If the those values are Pubmed ids or DOIs, please ignore this messsage" - return msg - - def STD46(self): - msg = f"If date of collection/isolation/deposit/inclusion in the catalog is after 2014," \ - f" the value of column Geographic Origin must be provided and associated with a country in the " \ - f"Geographic Origin sheet. The value is missing or not associated with a country for strain {self.pk}." - return msg - - - """ - Genomic Information Error Codes - """ - - def GID01(self): - return f"The 'Strain Acession Number' (Strain AN) column is a mandatory field in the Genomic Information Sheet." - - def GID02(self): - return f"The 'Strain Acession Number' (Strain AN) column is empty or has missing values." - - def GID03(self): - return f"The value of 'Strain Acession Number' (Strain AN) {self.value} is not in the Strains sheet." - - def GID04(self): - return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty." - - def GID05(self): - return f"The 'Marker' for genomic information with Strain AN {self.pk} is missing." - - def GID06(self): - return f"The value of 'Marker' {self.value} is not in the Markers sheet." - - def GID07(self): - return f"The 'INSDC AN' column is a mandatory field in the Genomic Information Sheet. The column can not be empty." - - def GID08(self): - return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is missing." - - def GID09(self): - return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is incorrect." - - def GID10(self): - return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect." - " It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces.") - - """ - Ontobiotope Error Codes - """ - - def OTD01(self): - return "The 'ID' columns is a mandatory field in the Ontobiotope Sheet." - - def OTD02(self): - return "The 'ID' columns is empty or has missing values." - - def OTD03(self): - return "The 'Name' columns is a mandatory field in the Ontobiotope Sheet. The column can not be empty." - - def OTD04(self): - return f"The 'Name' for ontobiotope with ID {self.pk} is missing." 
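For context on the file deleted just above: mirri/validation/error_logging/error_message.py resolves error codes to message methods by name — every code (EXL00, GMD01, STD07, …) is an upper-case method, and the `message` property looks the stored code up among the class's upper-case callables and invokes it. The following is a condensed, self-contained sketch of that dispatch pattern with only two representative codes; the class name is illustrative, not the deleted class itself.

    from typing import Optional


    class ErrorMessageSketch:
        """Condensed sketch of the code-to-method dispatch used by the deleted ErrorMessage class."""

        def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None):
            self.code = code.upper()   # codes are stored upper-case, as in the original
            self.pk = pk
            self.value = value

        @property
        def message(self) -> str:
            # Upper-case callable attributes double as the registry of known error codes.
            codes = [name for name in dir(self)
                     if name.isupper() and callable(getattr(self, name))]
            if self.code not in codes:
                raise ValueError(f"{self.code} not found")
            return getattr(self, self.code)()

        def GOD05(self):
            return f"The 'Country' for geographic origin with ID {self.pk} is incorrect."

        def STD07(self):
            return (f"The 'Restriction on use' for strain with Accession Number {self.pk} "
                    f"is not according to the specification. Your value is {self.value} "
                    "and the accepted values are 1, 2, 3.")


    # Usage: validators create an instance with a code plus the offending pk/value,
    # and the human-readable text is produced lazily via .message.
    print(ErrorMessageSketch("god05", pk="Loc1").message)
    print(ErrorMessageSketch("STD07", pk="CECT 5766", value="7").message)
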
diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 4d91e63..0000000 --- a/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -openpyxl -requests -requests_oauthlib -pycountry -deepdiff diff --git a/settings.py b/settings.py new file mode 100644 index 0000000..adca2e7 --- /dev/null +++ b/settings.py @@ -0,0 +1,339 @@ +from pathlib import Path + +DATA_DIR = Path(__file__).parent / "data" + +ACCESSION_NUMBER = "accession_number" +MIRRI_ACCESSION_NUMBER = 'mirri_accession_number' +RESTRICTION_ON_USE = "restriction_on_use" +NAGOYA_PROTOCOL = "nagoya_protocol" +ABS_RELATED_FILES = "abs_related_files" +MTA_FILES = "mta_file" +OTHER_CULTURE_NUMBERS = "other_culture_collection_numbers" +STRAIN_FROM_REGISTERED_COLLECTION = "strain_from_a_registered_collection" +RISK_GROUP = "risk_group" +DUAL_USE = "dual_use" +QUARANTINE = "quarantine" +ORGANISM_TYPE = "organism_type" +TAXON_NAME = "taxon_name" +TYPE = "type" +INFRASUBSPECIFIC_NAME = "infrasubspecific_names" +COMMENTS_ON_TAXONOMY = "comments_on_taxonomy" +STATUS = "status" +HISTORY_OF_DEPOSIT = "history_of_deposit" +DEPOSITOR = "depositor" +DATE_OF_DEPOSIT = "date_of_deposit" +COLLECTED_BY = "collected_by" +DATE_OF_COLLECTION = "date_of_collection" +ISOLATED_BY = "isolated_by" +DATE_OF_ISOLATION = "date_of_isolation" +DATE_OF_INCLUSION = "date_of_inclusion_on_catalog" +TESTED_TEMPERATURE_GROWTH_RANGE = "tested_temperature_growth_range" +RECOMMENDED_GROWTH_TEMP = "recommended_growth_temperature" +RECOMMENDED_GROWTH_MEDIUM = "recommended_media_for_growth" +FORM_OF_SUPPLY = "form_of_supply" +GEO_COORDS = "coordinates_of_geographic_origin" +ACCESSION_NAME = "other_denomination" +ALTITUDE = "altitude_of_geographic_origin" +GEOGRAPHIC_ORIGIN = "geographic_origin" +GMO = "gmo" +GMO_CONSTRUCTION_INFO = "gmo_construction_information" +MUTANT_INFORMATION = "mutant_information" +GENOTYPE = "genotype" +LITERATURE = "literature" +SEXUAL_STATE = "sexual_state" +PLOIDY = "ploidy" +INTERSPECIFIC_HYBRID = "interspecific_hybrid" +HYBRIDS = 'hybrids' +PLANT_PATHOGENICITY_CODE = "plant_pathogenicity_code" +PATHOGENICITY = "pathogenicity" +ENZYME_PRODUCTION = "enzyme_production" +PRODUCTION_OF_METABOLITES = "production_of_metabolites" +APPLICATIONS = "applications" +REMARKS = "remarks" +PLASMIDS = "plasmids" +PLASMIDS_COLLECTION_FIELDS = "plasmids_collections_fields" +SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation" +ISOLATION_HABITAT = "isolation_habitat" +ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat" +LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome" +AXENIC_CULTURE = "axenic_culture" +QPS ="qps" +SITE_LINK = "site_links" + +# StrainId +STRAIN_ID = "id" +COLLECTION_CODE = "collection_code" +STRAIN_PUI = "strain_pui" +STRAIN_URL = "strain_url" + +ID_SYNONYMS = 'id_synonyms' +# Taxonomy +GENUS = "genus" +SPECIES = "species" + +# Location +COUNTRY = "countryOfOriginCode" +SITE = "site" +STATE = "state" +PROVINCE = "province" +MUNICIPALITY = "municipality" +ISLAND = "island" +OTHER = "other" +LATITUDE = "latitude" +LONGITUDE = "longitude" +ALTITUDE = "altitude" +GEOREF_METHOD = "georeferencingMethod" +COORDUNCERTAINTY = "coordUncertainty" +COORD_SPATIAL_REFERENCE = "coordenatesSpatialReference" +LOCATION = "location" + +ALLOWED_COLLECTING_SITE_KEYS = [ + COUNTRY, + STATE, + PROVINCE, + ISLAND, + MUNICIPALITY, + OTHER, + SITE, + LATITUDE, + LONGITUDE, + ALTITUDE, + GEOREF_METHOD, + COORDUNCERTAINTY, + COORD_SPATIAL_REFERENCE, +] + +MIRRI_FIELDS = [ + {"attribute": "id", "label": 
"accessionNumber"}, + {"attribute": "mirri_accession_number", "label": "mirriAccessionNumber"}, + {"attribute": "qps", "label": "qps"}, + {"attribute": "axenic_culture", "label": "axenicCulture"}, + {"attribute": "restriction_on_use", "label": "useRestrictions"}, + {"attribute": "nagoya_protocol", + "label": "nagoyaConditions"}, + {"attribute": ABS_RELATED_FILES, "label": "absFile"}, + {"attribute": "mta_files", "label": "mtaFile"}, + {"attribute": "other_numbers", "label": "otherCollectionNumbers"}, + {"attribute": "is_from_registered_collection", + "label": "registeredCollection"}, + {"attribute": "risk_group", "label": "riskGroup"}, + {"attribute": "is_potentially_harmful", "label": "dualUse"}, + {"attribute": "is_subject_to_quarantine", "label": "euQuarantine"}, + {"attribute": "taxonomy.organism_type", "label": "organismType"}, + {"attribute": "taxonomy.taxon_name", "label": "speciesName"}, + {"attribute": "taxonomy.infrasubspecific_name", + "label": "infrasubspecificNames"}, + {"attribute": "taxonomy.comments", "label": "taxonomyComments"}, + {"attribute": "taxonomy.interspecific_hybrid", + "label": "hybrid"}, + {"attribute": "status", "label": "status"}, + {"attribute": "history", "label": "depositHistory", }, + {"attribute": "deposit.who", "label": "depositor"}, + {"attribute": "deposit.date", "label": "depositDate"}, + {"attribute": "catalog_inclusion_date", + "label": "accessionDate"}, + {"attribute": "collect.who", "label": "collector"}, + {"attribute": "collect.date", "label": "collectionDate"}, + {"attribute": "isolation.who", "label": "isolator"}, + {"attribute": "isolation.date", "label": "isolationDate"}, + {"attribute": "isolation.substrate_host_of_isolation", + "label": "substrate"}, + {"attribute": "growth.tested_temp_range", + "label": "temperatureGrowthRange"}, + {"attribute": "growth.recommended_temp", + "label": "recommendedTemperature"}, + {"attribute": "growth.recommended_media", + "label": "recommendedMedium"}, + {"attribute": "form_of_supply", "label": "supplyForms"}, + {"attribute": "other_denominations", "label": "otherDenomination"}, + {"attribute": "collect.location.coords", + "label": "geographicCoordinates"}, + {"attribute": "collect.site.links", + "label": "siteLinks"}, + {"attribute": "collect.location.altitude", + "label": "country"}, + {"attribute": "collect.location", "label": "geographicOrigin"}, + {"attribute": "collect.habitat", "label": "isolationHabitat"}, + {"attribute": "collect.habitat_ontobiotope", + "label": "ontobiotopeTerms"}, + {"attribute": "genetics.gmo", "label": "gmo"}, + {"attribute": "genetics.gmo_construction", + "label": "gmoConstruction"}, + {"attribute": "genetics.mutant_info", "label": "mutant"}, + {"attribute": "genetics.genotype", "label": "genotype"}, + {"attribute": "genetics.sexual_state", "label": "sexualState"}, + {"attribute": "genetics.ploidy", "label": "ploidy"}, + {"attribute": "genetics.plasmids", "label": "plasmids"}, + {"attribute": "genetics.plasmids_in_collections", + "label": "plasmidCollections"}, + {"attribute": "publications", "label": "identificationLiterature"}, + {"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"}, + {"attribute": "pathogenicity", "label": "pathogenicity"}, + {"attribute": "enzyme_production", "label": "enzymes"}, + {"attribute": "production_of_metabolites", + "label": "metabolites"}, + {"attribute": "type", + "label": "type"}, + {"attribute": "applications", "label": "applications", }, + {"attribute": "remarks", "label": "remarks"}, + {"attribute": 
LITERATURE_LINKED_TO_SEQ_GENOME, + "label": "sequenceLiterature"}, +] + +ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma", + 'forma.specialis'] +ALLOWED_TAXONOMIC_RANKS = ["family", "genus", "species"] + ALLOWED_SUBTAXA + +# nagoya +NAGOYA_NO_RESTRICTIONS = "no_known_restrictions_under_the_Nagoya_protocol" +NAGOYA_DOCS_AVAILABLE = "documents_providing_proof_of_legal_access_and_terms_of_use_available_at_the_collection" +NAGOYA_PROBABLY_SCOPE = "strain_probably_in_scope,_please_contact_the_culture_collection" + +ALLOWED_NAGOYA_OPTIONS = [NAGOYA_NO_RESTRICTIONS, + NAGOYA_DOCS_AVAILABLE, NAGOYA_PROBABLY_SCOPE] + +# Use restriction +NO_RESTRICTION = "no_restriction" +ONLY_RESEARCH = "only_research" +COMMERCIAL_USE_WITH_AGREEMENT = "commercial_use_with_agreement" + +ALLOWED_RESTRICTION_USE_OPTIONS = [ + NO_RESTRICTION, + ONLY_RESEARCH, + COMMERCIAL_USE_WITH_AGREEMENT, +] + +ALLOWED_RISK_GROUPS = ["1", "2", "3", "4"] + +AGAR = "Agar" +CRYO = "Cryo" +DRY_ICE = "Dry Ice" +LIQUID_CULTURE_MEDIUM = "Liquid Culture Medium" +LYO = "Lyo" +OIL = "Oil" +WATER = "Water" +ALLOWED_FORMS_OF_SUPPLY = [AGAR, CRYO, DRY_ICE, + LIQUID_CULTURE_MEDIUM, LYO, OIL, WATER] + +DEPOSIT = "deposit" +ISOLATION = "isolation" +COLLECT = "collect" +GROWTH = "growth" +GENETICS = "genetics" +TAXONOMY = "taxonomy" +# Markers +MARKERS = "markers" +MARKER_TYPE = "marker_type" +MARKER_INSDC = "INSDC" +MARKER_SEQ = "marker_seq" +ALLOWED_MARKER_TYPES = [ + {"acronym": "16S rRNA", "marker": "16S rRNA"}, + {"acronym": "ACT", "marker": "Actin"}, + {"acronym": "CaM", "marker": "Calmodulin"}, + {"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"}, + {"acronym": "ITS", + "marker": "nuclear ribosomal Internal Transcribed Spacer (ITS)"}, + {"acronym": "LSU", "marker": "nuclear ribosomal Large SubUnit (LSU)"}, + {"acronym": "RPB1", "marker": "Ribosomal RNA-coding genes RPB1"}, + {"acronym": "RPB2", "marker": "Ribosomal RNA-coding genes RPB2"}, + {"acronym": "TUBB", "marker": "β-Tubulin"}, +] + +PUBLICATIONS = "publications" +PUB_ID = "pub_id" +PUB_DOI = "pub_doi" +PUB_PMID = "pub_pmid" +PUB_PUBMED_ID = '' +PUB_FULL_REFERENCE = "full_reference" +PUB_TITLE = "title" +PUB_AUTHORS = "authors" +PUB_JOURNAL = "journal" +PUB_YEAR = "year" +PUB_VOLUME = "volume" +PUB_ISSUE = "issue" +PUB_FIRST_PAGE = "first_page" +PUB_LAST_PAGE = "last_page" +BOOK_TITLE = "book_title" +BOOK_EDITOR = "book_editor" +BOOK_PUBLISHER = "book_publisher" + + +PUBLICATION_FIELDS = [ + {"label": "ID", "attribute": PUB_ID}, + {"label": "PMID", "attribute": PUB_PMID}, + {"label": "DOI", "attribute": PUB_DOI}, + {"label": "Full reference", "attribute": PUB_FULL_REFERENCE}, + {"label": "Authors", "attribute": PUB_AUTHORS}, + {"label": "Title", "attribute": PUB_TITLE}, + {"label": "Journal", "attribute": PUB_JOURNAL}, + {"label": "Year", "attribute": PUB_YEAR}, + {"label": "Volume", "attribute": PUB_VOLUME}, + {"label": "Issue", "attribute": PUB_ISSUE}, + {"label": "First page", "attribute": PUB_FIRST_PAGE}, + {"label": "Last page", "attribute": PUB_LAST_PAGE}, + {"label": "Book title", "attribute": BOOK_TITLE}, + {"label": "Editors", "attribute": BOOK_EDITOR}, + {"label": "Publisher", "attribute": BOOK_PUBLISHER}, +] + + +# ploidy +ANEUPLOID = 0 +HAPLOID = 1 +DIPLOID = 2 +TRIPLOID = 3 +TETRAPLOID = 4 +POLYPLOID = 9 + +ALLOWED_PLOIDIES = [ANEUPLOID, HAPLOID, DIPLOID, TRIPLOID, TETRAPLOID, + POLYPLOID] + +SUBTAXAS = { + "subsp.": "subspecies", + "var.": "variety", + "convar.": "convarietas", + "group.": "group", + "f.": "forma", + "f.sp.": 
"forma.specialis" +} + +#Control +VERSION = "Version" +DATE = "Date" + + +#Country codes +COUNTRY = "Country" +CODE = "Code" +ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE = "Additional information on the country or code" + + +#Country codes files +COUNTRY_CODES_SHEET = [ + {"label": "Country", "attribute": COUNTRY}, + {"label": "Code", "attribute": CODE}, + {"label": "Additional information on the country or code", "attribute": ADDITIONAL_INFORMATION_ON_THE_COUNTRY_OR_CODE}, +] + + +#Controle files +CONTROL_FIELDS = [ + {"label": "Version", "attribute": VERSION}, + {"label": "Date", "attribute": DATE}, +] + +# Excel sheet name +LOCATIONS = "Geographic origin" # 'Locations' +GROWTH_MEDIA = "Growth media" +GENOMIC_INFO = "Genomic information" +STRAINS = "Strains" +LITERATURE_SHEET = "Literature" +SEXUAL_STATE_SHEET = "Sexual state" +RESOURCE_TYPES_VALUES = "Resource types values" +FORM_OF_SUPPLY_SHEET = "Forms of supply" +PLOIDY_SHEET = "Ploidy" +ONTOBIOTOPE = "Ontobiotope" +MARKERS = "Markers" +CONTROL_SHEET = "Version" +COUNTRY_CODES_SHEET = "Country codes" +RESOURCE_SHEET = 'Resource types values' diff --git a/mirri/settings.py b/settings_v1.py similarity index 97% rename from mirri/settings.py rename to settings_v1.py index 8f731f6..394be32 100644 --- a/mirri/settings.py +++ b/settings_v1.py @@ -1,5 +1,7 @@ from pathlib import Path +from charset_normalizer import VERSION + DATA_DIR = Path(__file__).parent / "data" ACCESSION_NUMBER = "accession_number" @@ -282,15 +284,28 @@ SUBTAXAS = { "f.sp.": "forma.specialis" } + +#Control +VERSION = "Version" +DATE = "Date" + + +#Controle files +CONTROL_FIELDS = [ + {"label": "Version", "attribute": VERSION}, + {"label": "Date", "attribute": DATE}, +] + # Excel sheet name LOCATIONS = "Geographic origin" # 'Locations' GROWTH_MEDIA = "Growth media" GENOMIC_INFO = "Genomic information" STRAINS = "Strains" LITERATURE_SHEET = "Literature" -SEXUAL_STATE_SHEET = "Sexual states" +SEXUAL_STATE_SHEET = "Sexual state" RESOURCE_TYPES_VALUES = "Resource types values" FORM_OF_SUPPLY_SHEET = "Forms of supply" PLOIDY_SHEET = "Ploidy" ONTOBIOTOPE = "Ontobiotope" MARKERS = "Markers" +CONTROL_SHEET = "Version" diff --git a/setup.py b/setup.py deleted file mode 100644 index 316631c..0000000 --- a/setup.py +++ /dev/null @@ -1,35 +0,0 @@ -import setuptools -from pathlib import Path -from setuptools import find_packages - -with open("README.md", "r") as fh: - long_description = fh.read() - -requirements = [line.strip() for line in open('requirements.txt')] -scripts = [str(f) for f in Path('./bin').glob('*.py')] - -setuptools.setup( - name="Mirri utils", # Replace with your own username - version=0.1, - author="P.Ziarsolo", - author_email="pziarsolo@gmail.com", - description="A small library to help dealing with MIRRI data", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/pziarsolo/mirri_utils", - packages=find_packages(), - package_data={"mirri": ['data/ontobiotopes.csv']}, - # package_dir={"mirri.entities": "mirri.entities" - # "mirri.io.parsers": "mirri.io.parsers", - # "mirri.io.writers": "mirri.io.writers", - # 'mirri.validation': 'mirri.vallidation'}, - install_requires=requirements, - scripts=scripts, - license="GNU General Public License v3.0", - classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - python_requires='>=3.6', -) diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index 
e69de29..0000000 diff --git a/tests/biolomics/__init__.py b/tests/biolomics/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/biolomics/test_auth_operations.py b/tests/biolomics/test_auth_operations.py deleted file mode 100644 index ec21098..0000000 --- a/tests/biolomics/test_auth_operations.py +++ /dev/null @@ -1,22 +0,0 @@ -import unittest - -from mirri.biolomics.remote.rest_client import BiolomicsClient -try: - from mirri.biolomics.secrets import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD -except ImportError: - raise ImportError( - 'You need a secrets.py in the project dir. with CLIENT_ID, SECRET_ID, USERNAME, PASSWORD') - -from .utils import VERSION, SERVER_URL - - -class BiolomicsClientAuthTest(unittest.TestCase): - - def test_authentication(self): - client = BiolomicsClient(SERVER_URL, VERSION, CLIENT_ID, SECRET_ID, - USERNAME, PASSWORD) - access1 = client.get_access_token() - access2 = client.get_access_token() - assert access1 is not None - self.assertEqual(access1, access2) - diff --git a/tests/biolomics/test_growth_medium_operations.py b/tests/biolomics/test_growth_medium_operations.py deleted file mode 100644 index 12de1a6..0000000 --- a/tests/biolomics/test_growth_medium_operations.py +++ /dev/null @@ -1,62 +0,0 @@ -import unittest - -from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS -from mirri.biolomics.serializers.growth_media import GrowthMedium -from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from tests.biolomics.utils import SERVER_URL, VERSION - - -class BiolomicsSequenceClientTest(unittest.TestCase): - def setUp(self): - self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, - SECRET_ID, USERNAME, PASSWORD) - - def test_retrieve_media_by_id(self): - record_id = 101 - growth_medium = self.client.retrieve_by_id('growth_medium', record_id) - self.assertEqual(growth_medium.record_id, record_id) - - self.assertEqual(growth_medium.record_name, 'MA2PH6') - - def test_retrieve_media_by_id(self): - record_name = 'MA2PH6' - record_id = 101 - growth_medium = self.client.retrieve_by_name('growth_medium', record_name) - self.assertEqual(growth_medium.record_id, record_id) - self.assertEqual(growth_medium.record_name, record_name) - - def test_create_growth_media(self): - self.client.start_transaction() - try: - growth_medium = GrowthMedium() - growth_medium.acronym = 'BBB' - growth_medium.ingredients = 'alkhdflakhf' - growth_medium.description = 'desc' - - new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium) - print(new_growth_medium.dict()) - finally: - self.client.rollback() - - def test_update_growth_media(self): - self.client.start_transaction() - try: - growth_medium = GrowthMedium() - growth_medium.acronym = 'BBB' - growth_medium.ingredients = 'alkhdflakhf' - growth_medium.description = 'desc' - growth_medium.full_description = 'full' - new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium) - - new_growth_medium.full_description = 'full2' - updated_gm = new_growth_medium = self.client.update(GROWTH_MEDIUM_WS, new_growth_medium) - self.assertEqual(updated_gm.full_description, new_growth_medium.full_description) - - retrieved = self.client.retrieve_by_id(GROWTH_MEDIUM_WS, new_growth_medium.record_id) - self.assertEqual(retrieved.full_description, updated_gm.full_description) - - finally: - self.client.rollback() - - diff --git a/tests/biolomics/test_literature_operations.py 
b/tests/biolomics/test_literature_operations.py deleted file mode 100644 index 196d17d..0000000 --- a/tests/biolomics/test_literature_operations.py +++ /dev/null @@ -1,46 +0,0 @@ -import unittest - -from .utils import VERSION, SERVER_URL -from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient, BIBLIOGRAPHY_WS -from mirri.entities.publication import Publication - - -class BiolomicsLiteratureClientTest(unittest.TestCase): - def setUp(self): - self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, - SECRET_ID, USERNAME, PASSWORD) - - def test_retrieve_biblio_by_id(self): - record_id = 100 - record_name = "Miscellaneous notes on Mucoraceae" - biblio = self.client.retrieve_by_id(BIBLIOGRAPHY_WS, record_id) - self.assertEqual(biblio.record_id, record_id) - - self.assertEqual(biblio.record_name, record_name) - - def test_retrieve_media_by_id(self): - record_id = 100 - record_name = "Miscellaneous notes on Mucoraceae" - biblio = self.client.retrieve_by_name(BIBLIOGRAPHY_WS, record_name) - self.assertEqual(biblio.record_id, record_id) - self.assertEqual(biblio.record_name, record_name) - self.assertEqual(biblio.year, 1994) - self.assertEqual(biblio.volume, '50') - - def test_create_biblio(self): - pub = Publication() - pub.pubmed_id = 'PM18192' - pub.journal = 'my_journal' - pub.title = 'awesome title' - pub.authors = 'pasdas, aposjdasd, alsalsfda' - pub.volume = 'volume 0' - record_id = None - try: - new_pub = self.client.create(BIBLIOGRAPHY_WS, pub) - record_id = new_pub.record_id - self.assertEqual(new_pub.title, pub.title) - self.assertEqual(new_pub.volume, pub.volume) - finally: - if record_id is not None: - self.client.delete_by_id(BIBLIOGRAPHY_WS, record_id) diff --git a/tests/biolomics/test_sequence_operations.py b/tests/biolomics/test_sequence_operations.py deleted file mode 100644 index ddf8864..0000000 --- a/tests/biolomics/test_sequence_operations.py +++ /dev/null @@ -1,49 +0,0 @@ -import unittest - -from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.biolomics.serializers.sequence import GenomicSequenceBiolomics -from .utils import VERSION, SERVER_URL - - -class BiolomicsSequenceClientTest(unittest.TestCase): - def setUp(self) -> None: - self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, - SECRET_ID, USERNAME, PASSWORD) - - def test_retrieve_seq_by_id(self): - record_id = 101 - sequence = self.client.retrieve_by_id('sequence', record_id) - - self.assertEqual(sequence.record_id, record_id) - self.assertEqual(sequence.record_name, 'MUM 02.54 - CaM') - self.assertEqual(sequence.marker_type, 'CaM') - - def test_retrieve_seq_by_name(self): - record_name = 'MUM 02.54 - CaM' - sequence = self.client.retrieve_by_name('sequence', record_name) - - self.assertEqual(sequence.record_id, 101) - self.assertEqual(sequence.record_name, record_name) - self.assertEqual(sequence.marker_type, 'CaM') - - def test_create_delete_sequence(self): - marker = GenomicSequenceBiolomics() - marker.marker_id = 'GGAAUUA' - marker.marker_seq = 'aattgacgat' - marker.marker_type = 'CaM' - marker.record_name = 'peioMarker' - - new_marker = self.client.create('sequence', marker) - self.assertEqual(new_marker.marker_id, 'GGAAUUA') - self.assertEqual(new_marker.marker_seq, 'aattgacgat') - self.assertEqual(new_marker.marker_type, 'CaM') - self.assertEqual(new_marker.record_name, 'peioMarker') 
- self.assertTrue(new_marker.record_id) - - self.client.delete_by_id('sequence', new_marker.record_id) - - -if __name__ == "__main__": - # import sys;sys.argv = ['', 'BiolomicsClient.Test.test_get_strain_by_id'] - unittest.main() diff --git a/tests/biolomics/test_serializers.py b/tests/biolomics/test_serializers.py deleted file mode 100644 index e57d1f3..0000000 --- a/tests/biolomics/test_serializers.py +++ /dev/null @@ -1,727 +0,0 @@ -import unittest -import pycountry -import deepdiff -from pprint import pprint -from mirri.biolomics.serializers.sequence import ( - GenomicSequenceBiolomics, - serialize_to_biolomics as sequence_to_biolomics, - serialize_from_biolomics as sequence_from_biolomics) - -from mirri.biolomics.serializers.strain import ( - serialize_to_biolomics as strain_to_biolomics, - serialize_from_biolomics as strain_from_biolomics) -from mirri.biolomics.serializers.growth_media import ( - # serialize_to_biolomics as growth_medium_to_biolomics, - serialize_from_biolomics as growth_medium_from_biolomics) -from mirri.biolomics.serializers.bibliography import ( - serializer_from_biolomics as literature_from_biolomics, - serializer_to_biolomics as literature_to_biolomics -) -from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.entities.publication import Publication -from .utils import create_full_data_strain, VERSION, SERVER_URL - - -STRAIN_WS = { - 'CreationDate': '2021-05-19T12:22:33', - 'CreatorUserName': 'pziarsolo@cect.org', - 'LastChangeDate': '2021-05-19T12:22:36', - 'LastChangeUserName': 'pziarsolo@cect.org', - 'RecordDetails': {'ABS related files': {'FieldType': 21, - 'Value': [{'Name': 'link', - 'Value': 'https://example.com'}]}, - 'Altitude of geographic origin': {'FieldType': 4, - 'Value': 121.0}, - 'Applications': {'FieldType': 5, 'Value': 'health'}, - 'Catalog URL': {'FieldType': 21, 'Value': []}, - 'Collection accession number': {'FieldType': 5, - 'Value': 'TESTCC 1'}, - 'Collection date': {'FieldType': 8, 'Value': '1991/01/01'}, - 'Collector': {'FieldType': 5, 'Value': 'the collector'}, - 'Comment on taxonomy': {'FieldType': 5, - 'Value': 'lalalalla'}, - 'Coordinates of geographic origin': {'FieldType': 12, - 'Value': {'Altitude': 0.0, - 'Latitude': 23.3, - 'Longitude': 23.3, - 'Precision': 0.0}}, - 'Country': {'FieldType': 118, - 'Value': [{'Name': {'FieldType': 5, - 'Value': 'Spain'}, - 'RecordId': 54, - 'TargetFieldValue': None}]}, - 'Data provided by': {'FieldType': 22, 'Value': 'Unknown'}, - 'Date of inclusion in the catalogue': {'FieldType': 8, - 'Value': '1985/05/02'}, - 'Deposit date': {'FieldType': 8, 'Value': '1985/05/02'}, - 'Depositor': {'FieldType': 5, - 'Value': 'NCTC, National Collection of Type ' - 'Cultures - NCTC, London, United ' - 'Kingdom of Great Britain and ' - 'Northern Ireland.'}, - 'Dual use': {'FieldType': 20, 'Value': 'yes'}, - 'Enzyme production': {'FieldType': 5, - 'Value': 'some enzimes'}, - 'Form': {'FieldType': 3, - 'Value': [{'Name': 'Agar', 'Value': 'yes'}, - {'Name': 'Cryo', 'Value': 'no'}, - {'Name': 'Dry Ice', 'Value': 'no'}, - {'Name': 'Liquid Culture Medium', - 'Value': 'no'}, - {'Name': 'Lyo', 'Value': 'yes'}, - {'Name': 'Oil', 'Value': 'no'}, - {'Name': 'Water', 'Value': 'no'}]}, - 'GMO': {'FieldType': 22, 'Value': 'Yes'}, - 'GMO construction information': {'FieldType': 5, - 'Value': 'instructrion to ' - 'build'}, - 'Genotype': {'FieldType': 5, 'Value': 'some genotupe'}, - 'Geographic origin': {'FieldType': 5, - 
'Value': 'una state; one ' - 'municipality; somewhere in ' - 'the world'}, - 'History': {'FieldType': 5, - 'Value': 'newer < In the middle < older'}, - 'Infrasubspecific names': {'FieldType': 5, - 'Value': 'serovar tete'}, - 'Interspecific hybrid': {'FieldType': 20, 'Value': 'no'}, - 'Isolation date': {'FieldType': 8, 'Value': '1900/01/01'}, - 'Isolation habitat': {'FieldType': 5, - 'Value': 'some habitat'}, - 'Isolator': {'FieldType': 5, 'Value': 'the isolator'}, - 'Literature': {'FieldType': 118, 'Value': []}, - 'MTA files URL': {'FieldType': 21, - 'Value': [{'Name': 'link', - 'Value': 'https://example.com'}]}, - 'MTA text': {'FieldType': 5, 'Value': ''}, - 'Metabolites production': {'FieldType': 5, - 'Value': 'big factory of cheese'}, - 'Mutant information': {'FieldType': 5, 'Value': 'x-men'}, - 'Nagoya protocol restrictions and compliance conditions': {'FieldType': 20, - 'Value': 'no ' - 'known ' - 'restrictions ' - 'under ' - 'the ' - 'Nagoya ' - 'protocol'}, - 'Ontobiotope': {'FieldType': 118, - 'Value': [{'Name': {'FieldType': 5, - 'Value': 'anaerobic ' - 'bioreactor ' - '(OBT:000190)'}, - 'RecordId': 100, - 'TargetFieldValue': None}]}, - 'Ontobiotope term for the isolation habitat': {'FieldType': 5, - 'Value': ''}, - 'Orders': {'FieldType': 118, 'Value': []}, - 'Organism type': {'FieldType': 3, - 'Value': [{'Name': 'Algae', 'Value': 'no'}, - {'Name': 'Archaea', - 'Value': 'yes'}, - {'Name': 'Bacteria', - 'Value': 'no'}, - {'Name': 'Cyanobacteria', - 'Value': 'no'}, - {'Name': 'Filamentous Fungi', - 'Value': 'no'}, - {'Name': 'Phage', 'Value': 'no'}, - {'Name': 'Plasmid', - 'Value': 'no'}, - {'Name': 'Virus', 'Value': 'no'}, - {'Name': 'Yeast', 'Value': 'no'}, - {'Name': 'Microalgae', - 'Value': '?'}]}, - 'Other culture collection numbers': {'FieldType': 5, - 'Value': 'aaa a; aaa3 ' - 'a3'}, - 'Other denomination': {'FieldType': 5, 'Value': ''}, - 'Pathogenicity': {'FieldType': 5, 'Value': 'illness'}, - 'Plasmids': {'FieldType': 5, 'Value': 'asda'}, - 'Plasmids collections fields': {'FieldType': 5, - 'Value': 'asdasda'}, - 'Ploidy': {'FieldType': 20, 'Value': 'Polyploid'}, - 'Quarantine in Europe': {'FieldType': 20, 'Value': 'no'}, - 'Recommended growth medium': {'FieldType': 118, - 'Value': [{'Name': {'FieldType': 5, - 'Value': 'AAA'}, - 'RecordId': 1, - 'TargetFieldValue': None}]}, - 'Recommended growth temperature': {'FieldType': 19, - 'MaxValue': 30.0, - 'MinValue': 30.0}, - 'Remarks': {'FieldType': 5, 'Value': 'no remarks for me'}, - 'Restrictions on use': {'FieldType': 20, - 'Value': 'no restriction apply'}, - 'Risk group': {'FieldType': 20, 'Value': '1'}, - 'Sequences 16s': {"Value": [ - { - "Name": { - "Value": "X76436", - "FieldType": 5 - }, - "RecordId": 50992, - "TargetFieldValue": { - "Value": { - "Sequence": "" - }, - "FieldType": 14 - } - } - ], - "FieldType": 114}, - 'Sequences 18S rRNA': {'FieldType': 114, 'Value': []}, - 'Sequences 23S rRNA': {'FieldType': 114, 'Value': []}, - 'Sequences ACT': {'FieldType': 114, 'Value': []}, - 'Sequences AmdS': {'FieldType': 114, 'Value': []}, - 'Sequences Amds12': {'FieldType': 114, 'Value': []}, - 'Sequences Beta tubulin': {'FieldType': 114, 'Value': []}, - 'Sequences COX1': {'FieldType': 114, 'Value': []}, - 'Sequences COX2': {'FieldType': 114, 'Value': []}, - 'Sequences CaM': {'FieldType': 114, 'Value': []}, - 'Sequences Cct8': {'FieldType': 114, 'Value': []}, - 'Sequences Cit1': {'FieldType': 114, 'Value': []}, - 'Sequences CypA': {'FieldType': 114, 'Value': []}, - 'Sequences GDP': {'FieldType': 114, 'Value': []}, - 'Sequences 
GPD': {'FieldType': 114, 'Value': []}, - 'Sequences Genome': {'FieldType': 114, 'Value': []}, - 'Sequences HIS': {'FieldType': 114, 'Value': []}, - 'Sequences HSP': {'FieldType': 114, 'Value': []}, - 'Sequences IDH': {'FieldType': 114, 'Value': []}, - 'Sequences IGS': {'FieldType': 114, 'Value': []}, - 'Sequences ITS': {'FieldType': 114, 'Value': []}, - 'Sequences LSU': {'FieldType': 114, 'Value': []}, - 'Sequences MAT': {'FieldType': 114, 'Value': []}, - 'Sequences MAT1': {'FieldType': 114, 'Value': []}, - 'Sequences Miscellaneous': {'FieldType': 114, 'Value': []}, - 'Sequences NorA': {'FieldType': 114, 'Value': []}, - 'Sequences NorB': {'FieldType': 114, 'Value': []}, - 'Sequences Omt12': {'FieldType': 114, 'Value': []}, - 'Sequences OmtA': {'FieldType': 114, 'Value': []}, - 'Sequences PcCYP': {'FieldType': 114, 'Value': []}, - 'Sequences PpgA': {'FieldType': 114, 'Value': []}, - 'Sequences PreA': {'FieldType': 114, 'Value': []}, - 'Sequences PreB': {'FieldType': 114, 'Value': []}, - 'Sequences RAPD': {'FieldType': 114, 'Value': []}, - 'Sequences RPB1': {'FieldType': 114, 'Value': []}, - 'Sequences RPB2': {'FieldType': 114, 'Value': []}, - 'Sequences SSU': {'FieldType': 114, 'Value': []}, - 'Sequences TEF1a': {'FieldType': 114, 'Value': []}, - 'Sequences TEF2': {'FieldType': 114, 'Value': []}, - 'Sequences TUB': {'FieldType': 114, 'Value': []}, - 'Sequences Tsr1': {'FieldType': 114, 'Value': []}, - 'Sequences c16S rRNA': {'FieldType': 114, 'Value': []}, - 'Sequences cbhI': {'FieldType': 114, 'Value': []}, - 'Sequences mcm7': {'FieldType': 114, 'Value': []}, - 'Sequences rbcL': {'FieldType': 114, 'Value': []}, - 'Sexual state': {'FieldType': 5, 'Value': 'MT+A'}, - 'Status': {'FieldType': 5, - 'Value': 'type of Bacillus alcalophilus'}, - 'Strain from a registered collection': {'FieldType': 20, - 'Value': 'no'}, - 'Substrate of isolation': {'FieldType': 5, - 'Value': 'some substrate'}, - 'Taxon name': {'FieldType': 109, - 'Value': [{'Name': {'FieldType': 5, - 'Value': 'Escherichia ' - 'coli'}, - 'RecordId': 100004123, - 'TargetFieldValue': {'DesktopInfo': None, - 'DesktopInfoHtml': 'Current ' - 'name: ' - 'Escherichia ' - 'coli ' - '(Migula ' - '1895) ' - 'Castellani ' - 'and ' - 'Chalmers ' - '1919', - 'FieldType': 27, - 'NewSynFieldInfo': None, - 'ObligateSynonymId': 0, - 'OriginalSynFieldInfo': None, - 'SynInfo': {'BasionymRecord': {'NameInfo': '', - 'RecordId': 100004123, - 'RecordName': 'Escherichia ' - 'coli ' - '(Migula ' - '1895) ' - 'Castellani ' - 'and ' - 'Chalmers ' - '1919', - 'SecondLevelRecords': None}, - 'CurrentNameRecord': {'NameInfo': '', - 'RecordId': 100004123, - 'RecordName': 'Escherichia ' - 'coli ' - '(Migula ' - '1895) ' - 'Castellani ' - 'and ' - 'Chalmers ' - '1919', - 'SecondLevelRecords': None}, - 'ObligateSynonymRecords': [], - 'SelectedRecord': { - 'NameInfo': 'Escherichia ' - 'coli ' - '(Migula ' - '1895) ' - 'Castellani ' - 'and ' - 'Chalmers ' - '1919', - 'RecordId': 100004123, - 'RecordName': 'Escherichia ' - 'coli ' - '(Migula ' - '1895) ' - 'Castellani ' - 'and ' - 'Chalmers ' - '1919', - 'SecondLevelRecords': None}, - 'TaxonSynonymsRecords': []}, - 'SynonymId': 100004123}}]}, - 'Tested temperature growth range': {'FieldType': 19, - 'MaxValue': 32.0, - 'MinValue': 29.0}, - 'Type description': {'FieldType': 5, 'Value': ''}}, - 'RecordId': 148038, - 'RecordName': 'MIRRI 2240561'} - -STRAIN_WS_EXPECTED_NO_REMOTE = { - 'Acronym': 'MIRRI', - 'RecordDetails': {'ABS related files': {'FieldType': 'U', - 'Value': [{'Name': 'link', - 'Value': 
'https://example.com'}]}, - 'Altitude of geographic origin': {'FieldType': 'D', - 'Value': 121}, - 'Applications': {'FieldType': 'E', 'Value': 'health'}, - 'Collection accession number': {'FieldType': 'E', - 'Value': 'TESTCC 1'}, - 'Collection date': {'FieldType': 'H', 'Value': '1991-01-01'}, - 'Collector': {'FieldType': 'E', 'Value': 'the collector'}, - 'Comment on taxonomy': {'FieldType': 'E', - 'Value': 'lalalalla'}, - 'Coordinates of geographic origin': {'FieldType': 'L', - 'Value': {'Latitude': 23.3, - 'Longitude': 23.3}}, - 'Date of inclusion in the catalogue': {'FieldType': 'H', - 'Value': '1985-05-02'}, - 'Deposit date': {'FieldType': 'H', 'Value': '1985-05-02'}, - 'Depositor': {'FieldType': 'E', - 'Value': 'NCTC, National Collection of Type ' - 'Cultures - NCTC, London, United ' - 'Kingdom of Great Britain and ' - 'Northern Ireland.'}, - 'Dual use': {'FieldType': 'T', 'Value': 'yes'}, - 'Enzyme production': {'FieldType': 'E', - 'Value': 'some enzimes'}, - 'Form': {'FieldType': 'C', - 'Value': [{'Name': 'Agar', 'Value': 'yes'}, - {'Name': 'Cryo', 'Value': 'no'}, - {'Name': 'Dry Ice', 'Value': 'no'}, - {'Name': 'Liquid Culture Medium', - 'Value': 'no'}, - {'Name': 'Lyo', 'Value': 'yes'}, - {'Name': 'Oil', 'Value': 'no'}, - {'Name': 'Water', 'Value': 'no'}]}, - 'GMO': {'FieldType': 'V', 'Value': 'Yes'}, - 'GMO construction information': {'FieldType': 'E', - 'Value': 'instructrion to ' - 'build'}, - 'Genotype': {'FieldType': 'E', 'Value': 'some genotupe'}, - 'Geographic origin': {'FieldType': 'E', - 'Value': 'una state; one ' - 'municipality; somewhere in ' - 'the world'}, - 'History': {'FieldType': 'E', - 'Value': 'firstplave < seconn place < third ' - 'place'}, - 'Infrasubspecific names': {'FieldType': 'E', - 'Value': 'serovar tete'}, - 'Interspecific hybrid': {'FieldType': 'T', 'Value': 'no'}, - 'Isolation date': {'FieldType': 'H', 'Value': '1900-01-01'}, - 'Isolation habitat': {'FieldType': 'E', - 'Value': 'some habitat'}, - 'Isolator': {'FieldType': 'E', 'Value': 'the isolator'}, - 'MTA files URL': {'FieldType': 'U', - 'Value': [{'Name': 'link', - 'Value': 'https://example.com'}]}, - 'Metabolites production': {'FieldType': 'E', - 'Value': 'big factory of cheese'}, - 'Mutant information': {'FieldType': 'E', 'Value': 'x-men'}, - 'Nagoya protocol restrictions and compliance conditions': {'FieldType': 'T', - 'Value': 'no ' - 'known ' - 'restrictions ' - 'under ' - 'the ' - 'Nagoya ' - 'protocol'}, - 'Ontobiotope': {'FieldType': 'RLink', 'Value': 'OBT:000190'}, - 'Organism type': {'FieldType': 'C', - 'Value': [{'Name': 'Algae', 'Value': 'no'}, - {'Name': 'Archaea', - 'Value': 'yes'}, - {'Name': 'Bacteria', - 'Value': 'no'}, - {'Name': 'Cyanobacteria', - 'Value': 'no'}, - {'Name': 'Filamentous Fungi', - 'Value': 'no'}, - {'Name': 'Phage', 'Value': 'no'}, - {'Name': 'Plasmid', - 'Value': 'no'}, - {'Name': 'Virus', 'Value': 'no'}, - {'Name': 'Yeast', - 'Value': 'no'}]}, - 'Other culture collection numbers': {'FieldType': 'E', - 'Value': 'aaa a; aaa3 ' - 'a3'}, - 'Pathogenicity': {'FieldType': 'E', 'Value': 'illness'}, - 'Plasmids': {'FieldType': 'E', 'Value': 'asda'}, - 'Plasmids collections fields': {'FieldType': 'E', - 'Value': 'asdasda'}, - 'Ploidy': {'FieldType': 'T', 'Value': 'Polyploid'}, - 'Quarantine in Europe': {'FieldType': 'T', 'Value': 'no'}, - 'Recommended growth temperature': {'FieldType': 'S', - 'MaxValue': 30.0, - 'MinValue': 30.0}, - 'Remarks': {'FieldType': 'E', 'Value': 'no remarks for me'}, - 'Restrictions on use': {'FieldType': 'T', - 'Value': 'no restriction 
apply'}, - 'Risk group': {'FieldType': 'T', 'Value': '1'}, - 'Sexual state': {'FieldType': 'E', 'Value': 'MT+A'}, - 'Status': {'FieldType': 'E', - 'Value': 'type of Bacillus alcalophilus'}, - 'Strain from a registered collection': {'FieldType': 'T', - 'Value': 'no'}, - 'Substrate of isolation': {'FieldType': 'E', - 'Value': 'some substrate'}, - 'Taxon name': {'FieldType': 'SynLink', - 'Value': 'Escherichia coli'}, - 'Tested temperature growth range': {'FieldType': 'S', - 'MaxValue': 32.0, - 'MinValue': 29.0}}} - - -class StrainSerializerTest(unittest.TestCase): - - def test_serialize_to_biolomics(self): - strain = create_full_data_strain() - ws_strain = strain_to_biolomics(strain, client=None) - self.assertDictEqual(ws_strain, STRAIN_WS_EXPECTED_NO_REMOTE) - - def test_serialize_to_biolomics_remote(self): - client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, - SECRET_ID, USERNAME, PASSWORD) - strain = create_full_data_strain() - marker = GenomicSequenceBiolomics() - marker.marker_id = "MUM 02.15 - Beta tubulin" - marker.marker_type = 'TUBB' - strain.genetics.markers = [marker] - ws_strain = strain_to_biolomics(strain, client=client) - - self.assertEqual(strain.collect.habitat_ontobiotope, - ws_strain['RecordDetails']['Ontobiotope']['Value'][0]['Name']['Value']) - self.assertEqual(pycountry.countries.get(alpha_3=strain.collect.location.country).name, - ws_strain['RecordDetails']['Country']['Value'][0]['Name']['Value']) - self.assertEqual(strain.publications[0].title, - ws_strain['RecordDetails']['Literature']['Value'][0]['Name']['Value']) - self.assertEqual(strain.genetics.markers[0].marker_id, - ws_strain['RecordDetails']['Sequences TUB']['Value'][0]['Name']['Value']) - - def test_serialize_from_biolomics(self): - ws_strain = STRAIN_WS - strain = strain_from_biolomics(ws_strain) - self.assertEqual(strain.record_id, 148038) - self.assertEqual(strain.record_name, 'MIRRI 2240561') - self.assertEqual(strain.taxonomy.long_name, 'Escherichia coli') - self.assertEqual(strain.growth.recommended_media, ['AAA']) - self.assertEqual(strain.collect.location.altitude, 121) - self.assertEqual(strain.collect.location.country, 'ESP') - self.assertEqual(strain.applications, 'health') - self.assertEqual(strain.id.strain_id, 'TESTCC 1') - self.assertEqual(strain.collect.date.strfdate, '19910101') - self.assertEqual(strain.taxonomy.comments, 'lalalalla') - self.assertEqual(strain.catalog_inclusion_date.strfdate, '19850502') - self.assertIn('NCTC, National Collection of Type ', strain.deposit.who) - self.assertTrue(strain.is_potentially_harmful) - self.assertEqual(strain.form_of_supply, ['Agar', 'Lyo']) - self.assertTrue(strain.genetics.gmo) - self.assertEqual(strain.genetics.gmo_construction, 'instructrion to build') - self.assertEqual(strain.genetics.genotype, 'some genotupe') - self.assertEqual(strain.history, ['newer', 'In the middle', 'older']) - self.assertEqual(strain.taxonomy.infrasubspecific_name, 'serovar tete') - self.assertEqual(strain.isolation.who, 'the isolator') - self.assertEqual(strain.isolation.date.strfdate, '19000101') - self.assertEqual(strain.mta_files, ['https://example.com']) - self.assertEqual(strain.genetics.mutant_info, 'x-men') - self.assertEqual(strain.collect.habitat_ontobiotope, 'OBT:000190') - self.assertEqual(strain.taxonomy.organism_type[0].name, 'Archaea') - self.assertEqual(strain.other_numbers[0].strain_id, 'aaa a') - self.assertEqual(strain.other_numbers[1].strain_id, 'aaa3 a3') - self.assertEqual(strain.pathogenicity, 'illness') - 
self.assertEqual(strain.genetics.plasmids, ['asda']) - self.assertEqual(strain.genetics.ploidy, 9) - self.assertFalse(strain.is_subject_to_quarantine) - self.assertEqual(strain.risk_group, '1') - self.assertFalse(strain.is_from_registered_collection) - self.assertEqual(strain.growth.tested_temp_range, {'min': 29, 'max': 32}) - - -BIOLOMICSSEQ = { - 'RecordDetails': { - 'Barcode level': {'FieldType': 20, 'Value': 'undefined'}, - 'DNA extract number': {'FieldType': 5, 'Value': ''}, - 'DNA sequence': {'FieldType': 14, - 'Value': {'Sequence': 'caaaggaggccttctccctcttcgtaag'}}, - 'Editing state': {'FieldType': 20, 'Value': 'Auto import'}, - 'Forward primer(s)': {'FieldType': 5, 'Value': ''}, - 'Genbank': {'FieldType': 21, 'Value': []}, - 'INSDC number': {'FieldType': 5, 'Value': 'AATGAT'}, - 'Literature': {'FieldType': 21, 'Value': []}, - 'Literature1': {'FieldType': 118, 'Value': []}, - 'Marker name': {'FieldType': 5, 'Value': 'CaM'}, - 'Privacy': {'FieldType': 20, 'Value': 'undefined'}, - 'Quality': {'FieldType': 5, 'Value': ''}, - 'Remarks': {'FieldType': 5, 'Value': ''}, - 'Reverse primer(s)': {'FieldType': 5, 'Value': ''}, - 'Review state': {'FieldType': 5, 'Value': ''}, - 'Strain number': {'FieldType': 5, 'Value': 'MUM 02.54'}}, - 'RecordId': 101, - 'RecordName': 'MUM 02.54 - CaM'} - - -class SequenceSerializerTest(unittest.TestCase): - - def test_from_biolomics(self): - marker = sequence_from_biolomics(BIOLOMICSSEQ) - self.assertEqual(marker.record_name, BIOLOMICSSEQ['RecordName']) - self.assertEqual(marker.record_id, BIOLOMICSSEQ['RecordId']) - self.assertEqual(marker.marker_type, BIOLOMICSSEQ['RecordDetails']['Marker name']['Value']) - self.assertEqual(marker.marker_id, BIOLOMICSSEQ['RecordDetails']['INSDC number']['Value']) - self.assertEqual(marker.marker_seq, BIOLOMICSSEQ['RecordDetails']['DNA sequence']['Value']['Sequence']) - - def test_to_biolomics(self): - marker = GenomicSequenceBiolomics() - marker.marker_id = 'GGAAUUA' - marker.marker_seq = 'aattgacgat' - marker.marker_type = 'CaM' - marker.record_name = 'peioMarker' - marker.record_id = 111 - ws_seq = sequence_to_biolomics(marker) - expected = {'RecordId': marker.record_id, - 'RecordName': marker.record_name, - 'RecordDetails': { - 'INSDC number': {'Value': marker.marker_id, 'FieldType': 'E'}, - 'DNA sequence': {'Value': {'Sequence': marker.marker_seq}, 'FieldType': 'N'}, - 'Marker name': {'Value': marker.marker_type, 'FieldType': 'E'}}} - - self.assertEqual(ws_seq, expected) - - -BIOLOMICS_MEDIUM = { - "RecordId": 100, - "RecordName": "MA20S", - "RecordDetails": { - "Full description": { - "Value": "mout agar+20% saccharose", - "FieldType": 5 - }, - "Ingredients": { - "Value": "Malt extract\r\n\tDilute brewery malt with water to 10% sugar solution (level 10 on Brix saccharose meter), 15 minutes at 121 C\r\nsaccharose\t200g\r\ndistilled water\t0.6l\r\nagar\t15g\r\n", - "FieldType": 5 - }, - "Link to full description": { - "Value": [], - "FieldType": 21 - }, - "Medium description": { - "Value": "", - "FieldType": 5 - }, - "Other name": { - "Value": "", - "FieldType": 5 - }, - "pH": { - "Value": "7 with KOH", - "FieldType": 5 - }, - "Remarks": { - "Value": "", - "FieldType": 5 - }, - "Reference": { - "Value": "", - "FieldType": 5 - }, - "Sterilization conditions": { - "Value": "15 minutes at 121 C", - "FieldType": 5 - } - } -} - - -class MediumSerializerTest(unittest.TestCase): - def test_from_biolomics(self): - medium = growth_medium_from_biolomics(BIOLOMICS_MEDIUM) - self.assertEqual(medium.record_id, 
BIOLOMICS_MEDIUM['RecordId']) - self.assertEqual(medium.record_name, BIOLOMICS_MEDIUM['RecordName']) - self.assertEqual(medium.ingredients, BIOLOMICS_MEDIUM['RecordDetails']['Ingredients']['Value']) - self.assertEqual(medium.full_description, BIOLOMICS_MEDIUM['RecordDetails']['Full description']['Value']) - self.assertEqual(medium.ph, BIOLOMICS_MEDIUM['RecordDetails']['pH']['Value']) - - -BIOLOMICS_BIBLIOGRAPHY = { - "RecordId": 100, - "RecordName": "Miscellaneous notes on Mucoraceae", - "RecordDetails": { - "Associated strains": { - "Value": [], - "FieldType": 118 - }, - "Associated taxa": { - "Value": [], - "FieldType": 118 - }, - "Authors": { - "Value": "Schipper, M.A.A.; Samson, R.A.", - "FieldType": 5 - }, - "Associated sequences": { - "Value": [], - "FieldType": 118 - }, - "Abstract": { - "Value": "", - "FieldType": 5 - }, - "Collection": { - "Value": "", - "FieldType": 5 - }, - "DOI number": { - "Value": "", - "FieldType": 5 - }, - "Editor(s)": { - "Value": "", - "FieldType": 5 - }, - "Full reference": { - "Value": "", - "FieldType": 5 - }, - "Hyperlink": { - "Value": [], - "FieldType": 21 - }, - "ISBN": { - "Value": "", - "FieldType": 5 - }, - "ISSN": { - "Value": "", - "FieldType": 5 - }, - "Issue": { - "Value": "", - "FieldType": 5 - }, - "Journal": { - "Value": "Mycotaxon", - "FieldType": 5 - }, - "Journal-Book": { - "Value": "", - "FieldType": 5 - }, - "Keywords": { - "Value": "", - "FieldType": 5 - }, - "Page from": { - "Value": "475", - "FieldType": 5 - }, - "Page to": { - "Value": "491", - "FieldType": 5 - }, - "Publisher": { - "Value": "", - "FieldType": 5 - }, - "PubMed ID": { - "Value": "", - "FieldType": 5 - }, - "Volume": { - "Value": "50", - "FieldType": 5 - }, - "Year": { - "Value": 1994, - "FieldType": 4 - } - } -} - - -class BibliographySerializerTest(unittest.TestCase): - def test_from_biolomics(self): - pub = literature_from_biolomics(BIOLOMICS_BIBLIOGRAPHY) - self.assertEqual(pub.record_name, "Miscellaneous notes on Mucoraceae") - self.assertEqual(pub.record_id, 100) - self.assertEqual(pub.year, 1994) - self.assertEqual(pub.authors, "Schipper, M.A.A.; Samson, R.A.") - - def test_to_biolomics(self): - pub = Publication() - pub.title = 'My title' - pub.year = 1992 - pub.authors = 'me and myself' - pub.pubmed_id = '1112222' - pub.issue = 'issue' - ws_data = literature_to_biolomics(pub) - expected = { - 'RecordDetails': { - 'Authors': {'FieldType': 'E', 'Value': 'me and myself'}, - 'PubMed ID': {'FieldType': 'E', 'Value': '1112222'}, - 'Issue': {'FieldType': 'E', 'Value': 'issue'}, - 'Year': {'FieldType': 'D', 'Value': 1992}}, - 'RecordName': 'My title'} - self.assertDictEqual(expected, ws_data) - - def test_to_biolomics2(self): - pub = Publication() - pub.pubmed_id = '1112222' - ws_data = literature_to_biolomics(pub) - expected = { - 'RecordDetails': { - 'PubMed ID': {'FieldType': 'E', 'Value': '1112222'}}, - 'RecordName': f'PUBMED:{pub.pubmed_id}'} - self.assertDictEqual(expected, ws_data) - - pub = Publication() - pub.doi = 'doi.er/111/12131' - ws_data = literature_to_biolomics(pub) - expected = { - 'RecordDetails': { - 'DOI number': {'FieldType': 'E', 'Value': pub.doi}}, - 'RecordName': f'DOI:{pub.doi}'} - self.assertDictEqual(expected, ws_data) - - -if __name__ == "__main__": - import sys; - sys.argv = ['', 'BibliographySerializerTest'] - unittest.main() diff --git a/tests/biolomics/test_strain_operations.py b/tests/biolomics/test_strain_operations.py deleted file mode 100644 index 2bf9584..0000000 --- a/tests/biolomics/test_strain_operations.py +++ /dev/null @@ 
-1,156 +0,0 @@ -import unittest - -from mirri.biolomics.remote.endoint_names import STRAIN_WS -from .utils import VERSION, SERVER_URL, create_full_data_strain -from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD -from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient -from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number - - -class BiolomicsStrainClientTest(unittest.TestCase): - def setUp(self): - self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, - SECRET_ID, USERNAME, PASSWORD) - - def test_retrieve_strain_by_id(self): - record_id = 14803 - strain = self.client.retrieve_by_id(STRAIN_WS, record_id) - self.assertEqual(strain.record_id, record_id) - print(strain.record_name) - - def test_retrieve_strain_by_name(self): - record_id = 14803 - record_name = 'MIRRI0014803' - strain = self.client.retrieve_by_name(STRAIN_WS, record_name) - self.assertEqual(strain.record_name, record_name) - self.assertEqual(strain.record_id, record_id) - - def test_search_strain(self): - accession_number = "BEA 0014B" - query = {"Query": [{"Index": 0, - "FieldName": "Collection accession number", - "Operation": "TextExactMatch", - "Value": accession_number}], - "Expression": "Q0", - "DisplayStart": 0, - "DisplayLength": 10} - - search_response = self.client.search(STRAIN_WS, query) - - self.assertEqual(search_response['total'], 1) - self.assertEqual(search_response['records'][0].id.strain_id, - accession_number) - - def test_search_strain4(self): - accession_number = "TESTCC 1" - query = {"Query": [{"Index": 0, - "FieldName": "Collection accession number", - "Operation": "TextExactMatch", - "Value": accession_number}], - "Expression": "Q0", - "DisplayStart": 0, - "DisplayLength": 10} - - search_response = self.client.search(STRAIN_WS, query) - for strain in search_response['records']: - print(strain) - self.client.delete_by_id(STRAIN_WS, strain.record_id) - - def test_search_strain_no_found(self): - accession_number = "BEA 0014B_" - query = {"Query": [{"Index": 0, - "FieldName": "Collection accession number", - "Operation": "TextExactMatch", - "Value": accession_number}], - "Expression": "Q0", - "DisplayStart": 0, - "DisplayLength": 10} - - search_response = self.client.search(STRAIN_WS, query) - - self.assertEqual(search_response['total'], 0) - self.assertFalse(search_response['records']) - - def test_create_strain(self): - strain = create_full_data_strain() - strain.taxonomy.interspecific_hybrid = None - record_id = None - try: - new_strain = self.client.create(STRAIN_WS, strain) - record_id = new_strain.record_id - self.assertIsNone(new_strain.taxonomy.interspecific_hybrid) - self.assertEqual(new_strain.growth.recommended_media, ['AAA']) - self.assertEqual(new_strain.id.strain_id, strain.id.strain_id) - finally: - if record_id is not None: - self.client.delete_by_id(STRAIN_WS, record_id) - - def test_update_strain(self): - strain = create_full_data_strain() - record_id = None - try: - new_strain = self.client.create(STRAIN_WS, strain) - record_id = new_strain.record_id - self.assertEqual(new_strain.id.strain_id, strain.id.strain_id) - self.assertFalse(new_strain.taxonomy.interspecific_hybrid) - new_strain.id.number = '2' - new_strain.taxonomy.interspecific_hybrid = None - updated_strain = self.client.update(STRAIN_WS, new_strain) - self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id) - self.assertIsNone(updated_strain.taxonomy.interspecific_hybrid) - - retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, 
record_id) - self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id) - self.assertIsNone(retrieved_strain.taxonomy.interspecific_hybrid) - finally: - if record_id is not None: - print('deleting') - self.client.delete_by_id(STRAIN_WS, record_id) - - def test_update_strain_pathogenicity(self): - strain = create_full_data_strain() - print(strain.pathogenicity) - record_id = None - try: - new_strain = self.client.create(STRAIN_WS, strain) - record_id = new_strain.record_id - self.assertEqual(new_strain.id.strain_id, strain.id.strain_id) - self.assertEqual(new_strain.pathogenicity, 'illness') - - new_strain.pathogenicity = None - updated_strain = self.client.update(STRAIN_WS, new_strain) - self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id) - self.assertIsNone(updated_strain.pathogenicity) - - retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id) - self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id) - self.assertIsNone(retrieved_strain.pathogenicity) - finally: - if record_id is not None: - self.client.delete_by_id(STRAIN_WS, record_id) - - def test_search_by_accession_number(self): - accession_number = "BEA 0014B" - strain = retrieve_strain_by_accession_number(self.client, accession_number) - self.assertEqual(strain.id.strain_id, accession_number) - - def test_search_by_accession_number(self): - accession_number = "BEA 0014B_" - strain = retrieve_strain_by_accession_number(self.client, accession_number) - self.assertFalse(strain) - - -class BiolomicsClientGrowthMediaTest(unittest.TestCase): - def setUp(self): - self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, - SECRET_ID, USERNAME, PASSWORD) - - def xtest_growth_media_by_name(self): - gm = self.client.retrieve('growth_media', 'AAA') - self.assertEqual(gm['Record Id'], 1) - - -if __name__ == "__main__": - # import sys;sys.argv = ['', - # 'BiolomicsWriter.test_mirri_excel_parser_invalid'] - unittest.main() diff --git a/tests/biolomics/utils.py b/tests/biolomics/utils.py deleted file mode 100644 index 0674577..0000000 --- a/tests/biolomics/utils.py +++ /dev/null @@ -1,99 +0,0 @@ -from mirri.biolomics.serializers.strain import StrainMirri -from mirri.entities.strain import StrainId, OrganismType -from mirri.entities.sequence import GenomicSequence -from mirri.entities.date_range import DateRange -from mirri.entities.publication import Publication -from mirri.settings import NAGOYA_NO_RESTRICTIONS - -VERSION = 'v2' -SERVER_URL = 'https://webservices.bio-aware.com/mirri_test' - - -def create_full_data_strain(): - strain = StrainMirri() - - strain.id.number = "1" - strain.id.collection = "TESTCC" - strain.id.url = "https://cect/2342" - - strain.restriction_on_use = "no_restriction" - strain.nagoya_protocol = NAGOYA_NO_RESTRICTIONS - strain.abs_related_files = ['https://example.com'] - strain.mta_files = ['https://example.com'] - strain.other_numbers.append(StrainId(collection="aaa", number="a")) - strain.other_numbers.append(StrainId(collection="aaa3", number="a3")) - strain.is_from_registered_collection = False - strain.risk_group = '1' - strain.is_potentially_harmful = True - strain.is_subject_to_quarantine = False - - strain.taxonomy.organism_type = [OrganismType(2)] - strain.taxonomy.genus = 'Escherichia' - strain.taxonomy.species = 'coli' - strain.taxonomy.interspecific_hybrid = False - strain.taxonomy.infrasubspecific_name = 'serovar tete' - strain.taxonomy.comments = 'lalalalla' - - strain.status = "type of Bacillus alcalophilus" - strain.history = 
'firstplave < seconn place < third place' - - strain.deposit.who = "NCTC, National Collection of Type Cultures - NCTC, London, United Kingdom of Great Britain and Northern Ireland." - strain.deposit.date = DateRange(year=1985, month=5, day=2) - strain.catalog_inclusion_date = DateRange(year=1985, month=5, day=2) - - strain.collect.location.country = "ESP" - strain.collect.location.state = "una state" - strain.collect.location.municipality = "one municipality" - strain.collect.location.longitude = 23.3 - strain.collect.location.latitude = 23.3 - strain.collect.location.altitude = 121 - strain.collect.location.site = "somewhere in the world" - strain.collect.habitat_ontobiotope = "OBT:000190" - strain.collect.habitat = 'some habitat' - strain.collect.who = "the collector" - strain.collect.date = DateRange(year=1991) - - strain.isolation.date = DateRange(year=1900) - strain.isolation.who = 'the isolator' - strain.isolation.substrate_host_of_isolation = 'some substrate' - - # already existing media in test_mirri - - strain.growth.recommended_temp = {'min': 30, 'max': 30} - strain.growth.recommended_media = ["AAA"] - strain.growth.tested_temp_range = {'min': 29, 'max': 32} - - strain.form_of_supply = ["Agar", "Lyo"] - - #strain.other_denominations = ["lajdflasjdldj"] - - gen_seq = GenomicSequence() - gen_seq.marker_id = "pepe" - gen_seq.marker_type = "16S rRNA" - strain.genetics.markers.append(gen_seq) - strain.genetics.ploidy = 9 - strain.genetics.genotype = 'some genotupe' - strain.genetics.gmo = True - strain.genetics.gmo_construction = 'instructrion to build' - strain.genetics.mutant_info = 'x-men' - strain.genetics.sexual_state = 'MT+A' - strain.genetics.plasmids = ['asda'] - strain.genetics.plasmids_in_collections = ['asdasda'] - - pub = Publication() - pub.title = "The genus Amylomyces" - strain.publications = [pub] - - strain.plant_pathogenicity_code = 'PATH:001' - strain.pathogenicity = 'illness' - strain.enzyme_production = 'some enzimes' - strain.production_of_metabolites = 'big factory of cheese' - strain.applications = 'health' - - strain.remarks = 'no remarks for me' - return strain - - -if __name__ == '__main__': - strain = create_full_data_strain() - print(strain.collect.habitat_ontobiotope) diff --git a/tests/data/invalid_content.mirri.xlsx b/tests/data/invalid_content.mirri.xlsx deleted file mode 100644 index 9b2d8c9..0000000 Binary files a/tests/data/invalid_content.mirri.xlsx and /dev/null differ diff --git a/tests/data/invalid_excel.mirri.json b/tests/data/invalid_excel.mirri.json deleted file mode 100644 index e7cf986..0000000 --- a/tests/data/invalid_excel.mirri.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "key1": "value1", - "key2": "value2", - "key3": "value3" -} \ No newline at end of file diff --git a/tests/data/invalid_structure.mirri.xlsx b/tests/data/invalid_structure.mirri.xlsx deleted file mode 100644 index 274fec1..0000000 Binary files a/tests/data/invalid_structure.mirri.xlsx and /dev/null differ diff --git a/tests/data/valid.mirri.full.xlsx b/tests/data/valid.mirri.full.xlsx deleted file mode 100644 index e141c19..0000000 Binary files a/tests/data/valid.mirri.full.xlsx and /dev/null differ diff --git a/tests/data/valid.mirri.xlsx b/tests/data/valid.mirri.xlsx deleted file mode 100644 index 9685a80..0000000 Binary files a/tests/data/valid.mirri.xlsx and /dev/null differ diff --git a/tests/test_entities.py b/tests/test_entities.py deleted file mode 100644 index 13db9ca..0000000 --- a/tests/test_entities.py +++ /dev/null @@ -1,318 +0,0 @@ -""" -Created on 2020(e)ko 
abe. 2(a) - -@author: peio -""" - -import unittest - -from mirri.entities.publication import Publication -from mirri.entities.date_range import DateRange -from mirri.entities.location import Location -from mirri.entities.sequence import GenomicSequence -from mirri.entities.strain import ( - Collect, - Deposit, - Isolation, - ValidationError, - OrganismType, - Strain, - StrainId, - Taxonomy, -) -from mirri.settings import ( - COLLECT, - COUNTRY, - DATE_OF_ISOLATION, - DEPOSIT, - DEPOSITOR, - GENETICS, - GROWTH, - ISOLATED_BY, - ISOLATION, - LOCATION, - MARKERS, - NAGOYA_DOCS_AVAILABLE, - NAGOYA_PROTOCOL, - ORGANISM_TYPE, - OTHER_CULTURE_NUMBERS, - PLOIDY, - RECOMMENDED_GROWTH_MEDIUM, - TAXONOMY, - DATE_OF_INCLUSION, NO_RESTRICTION -) -from mirri.validation.entity_validators import validate_strain - - -class TestDataRange(unittest.TestCase): - def test_data_range_init(self): - dr = DateRange() - - self.assertFalse(dr) - - self.assertEqual(dr.__str__(), "") - self.assertEqual(dr.range["start"], None) - self.assertEqual(dr.range["end"], None) - - dr.strpdate("2012") - self.assertEqual(dr.strfdate, "2012----") - self.assertTrue(dr) - - dr.strpdate("2012----") - self.assertEqual(dr.strfdate, "2012----") - - dr.strpdate("201212--") - self.assertEqual(dr.strfdate, "201212--") - try: - dr.strpdate("201213--") - self.fail() - except ValueError: - pass - - try: - dr = DateRange(year=2012, month=13) - self.fail() - except ValueError: - pass - - dr = DateRange(year=2020) - self.assertEqual(dr.strfdate, "2020----") - - dr2 = dr.strpdate("2012") - self.assertEqual(dr2.range["start"].year, 2012) - self.assertEqual(dr2.range["start"].month, 1) - self.assertEqual(dr2.range["start"].day, 1) - - self.assertEqual(dr2.range["end"].year, 2012) - self.assertEqual(dr2.range["end"].month, 12) - self.assertEqual(dr2.range["end"].day, 31) - - -class TestCollect(unittest.TestCase): - def test_collect_basic(self): - collect = Collect() - self.assertEqual(collect.dict(), {}) - - collect.location.country = "ESP" - collect.date = DateRange().strpdate("2012----") - - collect.who = "pepito" - self.assertEqual( - dict(collect.dict()), - { - "location": {"countryOfOriginCode": "ESP"}, - "collected_by": "pepito", - "date_of_collection": "2012----", - }, - ) - self.assertEqual(collect.__str__(), - "Collected: Spain in 2012---- by pepito") - - -class TestOrganismType(unittest.TestCase): - def test_basic_usage(self): - org_type = OrganismType(2) - self.assertEqual(org_type.name, "Archaea") - self.assertEqual(org_type.code, 2) - try: - org_type.ko = 'a' - self.fail() - except TypeError: - pass - - org_type = OrganismType("Archaea") - - -class TestTaxonomy(unittest.TestCase): - def test_taxonomy_basic(self): - taxonomy = Taxonomy() - self.assertEqual(taxonomy.dict(), {}) - self.assertFalse(taxonomy) - - def test_taxonomy_with_data(self): - taxonomy = Taxonomy() - taxonomy.genus = "Bacilus" - taxonomy.organism_type = [OrganismType("Archaea")] - taxonomy.species = "vulgaris" - self.assertEqual(taxonomy.long_name, "Bacilus vulgaris") - - # print(taxonomy.dict()) - - -class TestLocation(unittest.TestCase): - def test_empty_init(self): - loc = Location() - self.assertEqual(loc.dict(), {}) - self.assertFalse(loc) - - def test_add_data(self): - loc = Location() - loc.country = "esp" - self.assertEqual(loc.dict(), {COUNTRY: "esp"}) - loc.state = None - self.assertEqual(loc.dict(), {COUNTRY: "esp"}) - - -class TestStrain(unittest.TestCase): - def test_empty_strain(self): - strain = Strain() - self.assertEqual(strain.dict(), {}) - - def 
test_strain_add_data(self): - strain = Strain() - - strain.id.number = "5433" - strain.id.collection = "CECT" - strain.id.url = "https://cect/2342" - - try: - strain.nagoya_protocol = "asdas" - self.fail() - except ValidationError: - pass - - strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE - strain.dict()[NAGOYA_PROTOCOL] = NAGOYA_DOCS_AVAILABLE - - strain.collect.location.country = "ESP" - - self.assertEqual(strain.dict()[COLLECT][LOCATION][COUNTRY], "ESP") - - strain.genetics.ploidy = 9 - self.assertEqual(strain.dict()[GENETICS][PLOIDY], 9) - - strain.growth.recommended_media = ["asd"] - strain.isolation.date = DateRange(year=1900) - self.assertEqual(strain.dict()[ISOLATION] - [DATE_OF_ISOLATION], "1900----") - - strain.deposit.who = "pepe" - self.assertEqual(strain.dict()[DEPOSIT][DEPOSITOR], "pepe") - - strain.growth.recommended_media = ["11"] - self.assertEqual(strain.dict()[GROWTH] - [RECOMMENDED_GROWTH_MEDIUM], ["11"]) - - strain.taxonomy.organism_type = [OrganismType(2)] - self.assertEqual( - strain.dict()[TAXONOMY][ORGANISM_TYPE], [ - {"code": 2, "name": "Archaea"}] - ) - - strain.taxonomy.organism_type = [OrganismType("Algae")] - self.assertEqual( - strain.dict()[TAXONOMY][ORGANISM_TYPE], [ - {"code": 1, "name": "Algae"}] - ) - - strain.other_numbers.append(StrainId(collection="aaa", number="a")) - strain.other_numbers.append(StrainId(collection="aaa3", number="a3")) - self.assertEqual( - strain.dict()[OTHER_CULTURE_NUMBERS], - [ - {"collection_code": "aaa", "accession_number": "a"}, - {"collection_code": "aaa3", "accession_number": "a3"}, - ], - ) - strain.form_of_supply = ["Agar", "Lyo"] - gen_seq = GenomicSequence() - self.assertEqual(gen_seq.dict(), {}) - gen_seq.marker_id = "pepe" - gen_seq.marker_type = "16S rRNA" - strain.genetics.markers.append(gen_seq) - self.assertEqual( - strain.dict()[GENETICS][MARKERS], - [{"marker_type": "16S rRNA", "INSDC": "pepe"}], - ) - - strain.collect.habitat_ontobiotope = "OBT:111111" - self.assertEqual(strain.collect.habitat_ontobiotope, "OBT:111111") - - try: - strain.collect.habitat_ontobiotope = "OBT:11111" - self.fail() - except ValidationError: - pass - - # publications - try: - strain.publications = 1 - self.fail() - except ValidationError: - pass - pub = Publication() - pub.id = "1" - try: - strain.publications = pub - self.fail() - except ValidationError: - pass - - strain.publications = [pub] - self.assertEqual(strain.publications[0].id, "1") - - strain.catalog_inclusion_date = DateRange(year=1992) - self.assertEqual(strain.dict()[DATE_OF_INCLUSION], '1992----') - - import pprint - - pprint.pprint(strain.dict()) - - def test_strain_validation(self): - strain = Strain() - strain.form_of_supply = ['Lyo'] - - return - - errors = validate_strain(strain) - self.assertEqual(len(errors), 10) - - strain.id.collection = 'test' - strain.id.number = '1' - - - errors = validate_strain(strain) - self.assertEqual(len(errors), 9) - - strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE - strain.restriction_on_use = NO_RESTRICTION - strain.risk_group = 1 - strain.taxonomy.organism_type = [OrganismType(4)] - strain.taxonomy.hybrids = ['Sac lac', 'Sac lcac3'] - strain.growth.recommended_media = ['aa'] - strain.growth.recommended_temp = {'min': 2, 'max':5} - strain.form_of_supply = ['lyo'] - strain.collect.location.country = 'ESP' - errors = validate_strain(strain) - self.assertFalse(errors) - - -class TestIsolation(unittest.TestCase): - def test_iniatialize_isollation(self): - isolation = Isolation() - self.assertEqual(isolation.dict(), {}) - isolation.who 
= "pepito" - self.assertTrue(ISOLATED_BY in isolation.dict()) - isolation.date = DateRange().strpdate("2012----") - self.assertTrue(DATE_OF_ISOLATION in isolation.dict()) - - try: - isolation.location.site = "spain" - self.fail() - except (ValueError, AttributeError): - pass - - -class TestGenomicSequence(unittest.TestCase): - def test_empty_init(self): - gen_seq = GenomicSequence() - self.assertEqual(gen_seq.dict(), {}) - gen_seq.marker_id = "pepe" - gen_seq.marker_type = "16S rRNA" - self.assertEqual(gen_seq.dict(), { - "marker_type": "16S rRNA", "INSDC": "pepe"}) - - -if __name__ == "__main__": - # import sys;sys.argv = ['', 'TestStrain'] - unittest.main() diff --git a/tests/test_parsers.py b/tests/test_parsers.py deleted file mode 100644 index 96d5f8b..0000000 --- a/tests/test_parsers.py +++ /dev/null @@ -1,51 +0,0 @@ -from mirri.entities.strain import ValidationError -import unittest -from pathlib import Path -from pprint import pprint -from mirri.io.parsers.mirri_excel import parse_mirri_excel - -TEST_DATA_DIR = Path(__file__).parent / "data" - - -class MirriExcelTests(unittest.TestCase): - - def test_mirri_excel_parser(self): - in_path = TEST_DATA_DIR / "valid.mirri.xlsx" - with in_path.open("rb") as fhand: - parsed_data = parse_mirri_excel(fhand, version="20200601") - - medium = parsed_data["growth_media"][0] - self.assertEqual("1", medium.acronym) - self.assertEqual(medium.description, "NUTRIENT BROTH/AGAR I") - - strains = list(parsed_data["strains"]) - strain = strains[0] - self.assertEqual(strain.publications[0].id, 1) - self.assertEqual(strain.publications[0].title, 'Cosa') - self.assertEqual(strain.id.number, "1") - pprint(strain.dict()) - - def xtest_mirri_excel_parser_invalid_fail(self): - in_path = TEST_DATA_DIR / "invalid.mirri.xlsx" - with in_path.open("rb") as fhand: - try: - parse_mirri_excel(fhand, version="20200601") - self.fail() - except ValidationError: - pass - - def xtest_mirri_excel_parser_invalid(self): - in_path = TEST_DATA_DIR / "invalid.mirri.xlsx" - with in_path.open("rb") as fhand: - parsed_data = parse_mirri_excel( - fhand, version="20200601") - - errors = parsed_data["errors"] - for _id, _errors in errors.items(): - print(_id, _errors) - - -if __name__ == "__main__": - # import sys;sys.argv = ['', - # 'MirriExcelTests.test_mirri_excel_parser_invalid'] - unittest.main() diff --git a/tests/test_validation.py b/tests/test_validation.py deleted file mode 100644 index f809a5d..0000000 --- a/tests/test_validation.py +++ /dev/null @@ -1,589 +0,0 @@ -from datetime import datetime -import unittest -from pathlib import Path -from itertools import chain - -from mirri.validation.tags import ( - CHOICES, - COORDINATES, - CROSSREF, - CROSSREF_NAME, - DATE, - MATCH, - MISSING, - MULTIPLE, - NUMBER, - REGEXP, - SEPARATOR, - TAXON, - TYPE, - UNIQUE, - VALUES -) - -from mirri.validation.excel_validator import ( - is_valid_choices, - is_valid_coords, - is_valid_crossrefs, - is_valid_date, - is_valid_missing, - is_valid_number, - is_valid_regex, - is_valid_taxon, - is_valid_unique, - is_valid_file, - validate_mirri_excel, -) - - -TEST_DATA_DIR = Path(__file__).parent / "data" -TS_VALUE = "value" -TS_CONF = "conf" -TS_ASSERT = "assert_func" - - -class MirriExcelValidationTests(unittest.TestCase): - - def test_validation_structure(self): - in_path = TEST_DATA_DIR / "invalid_structure.mirri.xlsx" - with in_path.open("rb") as fhand: - error_log = validate_mirri_excel(fhand) - - entities = [] - err_codes = [] - for ett, errors in error_log.get_errors().items(): - 
entities.append(ett) - err_codes.extend([err.code for err in errors]) - - self.assertIn("EFS", entities) - self.assertIn("STD", entities) - self.assertIn("GOD", entities) - self.assertIn("GMD", entities) - - self.assertIn("EFS03", err_codes) - self.assertIn("EFS06", err_codes) - self.assertIn("EFS08", err_codes) - self.assertIn("GOD06", err_codes) - self.assertIn("GMD01", err_codes) - self.assertIn("STD05", err_codes) - self.assertIn("STD08", err_codes) - self.assertIn("STD12", err_codes) - - def test_validation_content(self): - in_path = TEST_DATA_DIR / "invalid_content.mirri.xlsx" - with in_path.open("rb") as fhand: - error_log = validate_mirri_excel(fhand) - - entities = [] - err_codes = [] - for ett, errors in error_log.get_errors().items(): - entities.append(ett) - err_codes.extend([err.code for err in errors]) - - self.assertTrue(len(err_codes) > 0) - - self.assertNotIn("EFS", entities) - self.assertIn("STD", entities) - self.assertIn("GOD", entities) - self.assertIn("GID", entities) - - self.assertIn("GOD04", err_codes) - self.assertIn("GOD07", err_codes) - self.assertIn("GID03", err_codes) - self.assertIn("STD11", err_codes) - self.assertIn("STD15", err_codes) - self.assertIn("STD22", err_codes) - self.assertIn("STD04", err_codes) - self.assertIn("STD10", err_codes) - self.assertIn("STD07", err_codes) - self.assertIn("STD14", err_codes) - self.assertIn("STD16", err_codes) - - def test_validation_valid(self): - in_path = TEST_DATA_DIR / "valid.mirri.xlsx" - with in_path.open("rb") as fhand: - error_log = validate_mirri_excel(fhand) - - self.assertTrue(len(error_log.get_errors()) == 0) - - -class ValidatoionFunctionsTest(unittest.TestCase): - - def test_is_valid_regex(self): - tests = [ - { - TS_VALUE: "abcDEF", - TS_CONF: {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "123456", - TS_CONF: {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: "123456", - TS_CONF: {TYPE: REGEXP, MATCH: r"\d+"}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "abcdef", - TS_CONF: {TYPE: REGEXP, MATCH: r"\d+"}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: "abc 123", - TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "123 abc", - TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "123 ", - TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_regex(value, conf)) - - def test_is_valid_choices(self): - tests = [ - { - TS_VALUE: "1", - TS_CONF: {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "1, 3", - TS_CONF: { - TYPE: CHOICES, - VALUES: ["1", "2", "3", "4"], - MULTIPLE: True, - SEPARATOR: "," - }, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "5", - TS_CONF: {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_choices(value, conf)) - - def test_is_valid_crossref(self): - tests = [ - { - TS_VALUE: "abc", - TS_CONF: { - TYPE: CROSSREF, - CROSSREF_NAME: "values", - "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, - }, - TS_ASSERT: self.assertTrue, - }, - { - TS_VALUE: "123", - TS_CONF: { - 
TYPE: CROSSREF, - CROSSREF_NAME: "values", - "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, - }, - TS_ASSERT: self.assertFalse, - }, - { - TS_VALUE: "abc, def", - TS_CONF: { - TYPE: CROSSREF, - CROSSREF_NAME: "values", - "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, - MULTIPLE: True, - SEPARATOR: ",", - }, - TS_ASSERT: self.assertTrue, - }, - { - TS_VALUE: "abc, 123", - TS_CONF: { - TYPE: CROSSREF, - CROSSREF_NAME: "values", - "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, - MULTIPLE: True, - SEPARATOR: ",", - }, - TS_ASSERT: self.assertFalse, - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_crossrefs(value, conf)) - - def test_is_valid_missing(self): - tests = [ - { - TS_VALUE: 1, - TS_CONF: {TYPE: MISSING}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "abc", - TS_CONF: {TYPE: MISSING}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: None, - TS_CONF: {TYPE: MISSING}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_missing(value, conf)) - - def test_is_valid_date(self): - tests = [ - { - TS_VALUE: '2020-04-07', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: '2020/04/07', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: datetime(2021, 5, 1), - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: '2020-05', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: '2020/05', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 2020, - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: '2021 05 01', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: '04-07-2020', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: '2021-02-31', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: '2021-15', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: '15-2021', - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: 3000, - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: -2020, - TS_CONF: {TYPE: DATE}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_date(value, conf)) - - def test_is_valid_coordinates(self): - tests = [ - { - TS_VALUE: "23; 50", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "-90; -100", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "90; 100", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "0; 0", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "10; 20; 5", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "10; 20; -5", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "91; 50", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: "87; 182", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: "-200; 182", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: "20, 40", - 
TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: "abc def", - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: 123, - TS_CONF: {TYPE: COORDINATES}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_coords(value, conf)) - - def test_is_valid_number(self): - tests = [ - { - TS_VALUE: 1, - TS_CONF: {TYPE: NUMBER}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 2.5, - TS_CONF: {TYPE: NUMBER}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "10", - TS_CONF: {TYPE: NUMBER}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "10.5", - TS_CONF: {TYPE: NUMBER}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 5, - TS_CONF: {TYPE: NUMBER, "min": 0}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 5, - TS_CONF: {TYPE: NUMBER, "max": 10}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 5, - TS_CONF: {TYPE: NUMBER, "min": 0, "max": 10}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "hello", - TS_CONF: {TYPE: NUMBER}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: 10, - TS_CONF: {TYPE: NUMBER, "max": 5}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: 0, - TS_CONF: {TYPE: NUMBER, "min": 5}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_number(value, conf)) - - def test_is_valid_taxon(self): - tests = [ - { - TS_VALUE: 'sp. species', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 'spp species subsp. subspecies', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 'spp species subsp. subspecies var. variety', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 'spp taxon', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 'Candidaceae', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: 'sp sp species', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertFalse - }, - { - TS_VALUE: 'spp species abc. 
def', - TS_CONF: {TYPE: TAXON}, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_taxon(value, conf)) - - def test_is_valid_unique(self): - tests = [ - { - TS_VALUE: "abc", - TS_CONF: { - TYPE: UNIQUE, - "label": "values", - "shown_values": {} - }, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "jkl", - TS_CONF: { - TYPE: UNIQUE, - "label": "values", - "shown_values": { - "values": {"abc": '', - "def": '', - "ghi": ''}, - } - }, - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: "abc", - TS_CONF: { - TYPE: UNIQUE, - "label": "values", - "shown_values": { - "values": {"abc": '', - "def": '', - "ghi": ''}, - } - }, - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - conf = test[TS_CONF] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_unique(value, conf)) - - def test_is_valid_file(self): - tests = [ - { - TS_VALUE: TEST_DATA_DIR / "invalid_structure.mirri.xlsx", - TS_ASSERT: self.assertTrue - }, - { - TS_VALUE: TEST_DATA_DIR / "invalid_excel.mirri.json", - TS_ASSERT: self.assertFalse - }, - ] - - for test in tests: - value = test[TS_VALUE] - assert_func = test[TS_ASSERT] - with self.subTest(value=value): - assert_func(is_valid_file(value,)) - - -if __name__ == "__main__": - import sys - # sys.argv = ['', - # 'ValidatoionFunctionsTest.test_is_valid_regex'] - unittest.main() diff --git a/tests/test_writers.py b/tests/test_writers.py deleted file mode 100644 index 94a8808..0000000 --- a/tests/test_writers.py +++ /dev/null @@ -1,24 +0,0 @@ - -import unittest -from pathlib import Path -from mirri.io.writers.mirri_excel import write_mirri_excel -from mirri.io.parsers.mirri_excel import parse_mirri_excel - -TEST_DATA_DIR = Path(__file__).parent / "data" - - -class MirriExcelTests(unittest.TestCase): - def test_valid_excel(self): - in_path = TEST_DATA_DIR / "valid.mirri.full.xlsx" - parsed_data = parse_mirri_excel(in_path.open('rb'), version="20200601") - strains = parsed_data["strains"] - growth_media = parsed_data["growth_media"] - out_path = Path("/tmp/test.xlsx") - - write_mirri_excel(out_path, strains, growth_media, version="20200601") - - -if __name__ == "__main__": - # import sys;sys.argv = ['', - # 'BiolomicsWriter.test_mirri_excel_parser_invalid'] - unittest.main() diff --git a/mirri/utils.py b/utils.py similarity index 100% rename from mirri/utils.py rename to utils.py diff --git a/validation/2B90F320 b/validation/2B90F320 new file mode 100644 index 0000000..960fd7a Binary files /dev/null and b/validation/2B90F320 differ diff --git a/validation/B3F84180 b/validation/B3F84180 new file mode 100644 index 0000000..c303ee1 Binary files /dev/null and b/validation/B3F84180 differ diff --git a/mirri/validation/__init__.py b/validation/__init__.py similarity index 100% rename from mirri/validation/__init__.py rename to validation/__init__.py diff --git a/validation/__pycache__/__init__.cpython-311.pyc b/validation/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..62112ef Binary files /dev/null and b/validation/__pycache__/__init__.cpython-311.pyc differ diff --git a/validation/__pycache__/excel_validator.cpython-311.pyc b/validation/__pycache__/excel_validator.cpython-311.pyc new file mode 100644 index 0000000..b7c96b7 Binary files /dev/null and b/validation/__pycache__/excel_validator.cpython-311.pyc differ diff --git 
a/validation/__pycache__/tags.cpython-311.pyc b/validation/__pycache__/tags.cpython-311.pyc new file mode 100644 index 0000000..cc3cf34 Binary files /dev/null and b/validation/__pycache__/tags.cpython-311.pyc differ diff --git a/validation/__pycache__/validate_v5.cpython-311.pyc b/validation/__pycache__/validate_v5.cpython-311.pyc new file mode 100644 index 0000000..ac53165 Binary files /dev/null and b/validation/__pycache__/validate_v5.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_12052023.cpython-311.pyc b/validation/__pycache__/validation_conf_12052023.cpython-311.pyc new file mode 100644 index 0000000..8349360 Binary files /dev/null and b/validation/__pycache__/validation_conf_12052023.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_20200601.cpython-311.pyc b/validation/__pycache__/validation_conf_20200601.cpython-311.pyc new file mode 100644 index 0000000..512676a Binary files /dev/null and b/validation/__pycache__/validation_conf_20200601.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_20200602.cpython-311.pyc b/validation/__pycache__/validation_conf_20200602.cpython-311.pyc new file mode 100644 index 0000000..720bd57 Binary files /dev/null and b/validation/__pycache__/validation_conf_20200602.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_20230224.cpython-311.pyc b/validation/__pycache__/validation_conf_20230224.cpython-311.pyc new file mode 100644 index 0000000..41e2b17 Binary files /dev/null and b/validation/__pycache__/validation_conf_20230224.cpython-311.pyc differ diff --git a/validation/__pycache__/validation_conf_20230324.cpython-311.pyc b/validation/__pycache__/validation_conf_20230324.cpython-311.pyc new file mode 100644 index 0000000..73028ae Binary files /dev/null and b/validation/__pycache__/validation_conf_20230324.cpython-311.pyc differ diff --git a/validation/__pycache__/version_config.cpython-311.pyc b/validation/__pycache__/version_config.cpython-311.pyc new file mode 100644 index 0000000..46d3c0e Binary files /dev/null and b/validation/__pycache__/version_config.cpython-311.pyc differ diff --git a/mirri/validation/error_logging/__init__.py b/validation/error_logging/__init__.py similarity index 100% rename from mirri/validation/error_logging/__init__.py rename to validation/error_logging/__init__.py diff --git a/validation/error_logging/__pycache__/__init__.cpython-311.pyc b/validation/error_logging/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000..7e36c7c Binary files /dev/null and b/validation/error_logging/__pycache__/__init__.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/error.cpython-311.pyc b/validation/error_logging/__pycache__/error.cpython-311.pyc new file mode 100644 index 0000000..6413d86 Binary files /dev/null and b/validation/error_logging/__pycache__/error.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/error_log.cpython-311.pyc b/validation/error_logging/__pycache__/error_log.cpython-311.pyc new file mode 100644 index 0000000..c00d750 Binary files /dev/null and b/validation/error_logging/__pycache__/error_log.cpython-311.pyc differ diff --git a/validation/error_logging/__pycache__/error_message.cpython-311.pyc b/validation/error_logging/__pycache__/error_message.cpython-311.pyc new file mode 100644 index 0000000..0483c6d Binary files /dev/null and b/validation/error_logging/__pycache__/error_message.cpython-311.pyc differ diff --git 
a/mirri/validation/error_logging/error.py b/validation/error_logging/error.py similarity index 100% rename from mirri/validation/error_logging/error.py rename to validation/error_logging/error.py diff --git a/mirri/validation/error_logging/error_log.py b/validation/error_logging/error_log.py similarity index 100% rename from mirri/validation/error_logging/error_log.py rename to validation/error_logging/error_log.py diff --git a/validation/error_logging/error_message.py b/validation/error_logging/error_message.py new file mode 100644 index 0000000..ad52e62 --- /dev/null +++ b/validation/error_logging/error_message.py @@ -0,0 +1,552 @@ +from typing import Optional + + +class ErrorMessage(): + """Error message + + Args: + code (str): Error code. + pk (str | optional): The instance's primary key that triggered the error. Defaults to None. + value (str | optional): The instance's value that triggered the error. Defaults to None. + """ + + def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None): + self.code = code.upper() + self.pk = pk + self.value = value + + @property + def _codes(self) -> list: + return [ + func + for func in dir(self) + if func.isupper() and + callable(getattr(self, func)) and + not func.startswith("__") + ] + + @property + def _messages(self) -> dict: + return {code: getattr(self, code) for code in self._codes} + + @property + def message(self) -> str: + if not self._validate_code(): + raise ValueError(f"{self.code} not found") + return self._messages[self.code]() + + @property + def code(self) -> str: + return self._code + + @code.setter + def code(self, code: str) -> None: + self._code = code.upper() + + def _validate_code(self) -> bool: + return self.code in self._codes + + @property + def pk(self) -> str: + return self._pk + + @pk.setter + def pk(self, pk: str) -> None: + self._pk = pk + + @property + def value(self) -> str: + return self._value + + @value.setter + def value(self, value: str) -> None: + self._value = value + + """ + Excel File Structure Error Codes + """ + + def EXL00(self): + return f"The provided file '{self.pk}' is not an excel(xlsx) file" + + def EFS01(self): + return "The 'Growth media' sheet is missing. Please check the provided excel template." + + def EFS02(self): + return "The 'Geographic origin' sheet is missing. Please check the provided excel template." + + def EFS03(self): + return "The 'Literature' sheet is missing. Please check the provided excel template." + + def EFS04(self): + return "The 'Sexual state' sheet is missing. Please check the provided excel template." + + def EFS05(self): + return "The 'Strains' sheet is missing. Please check the provided excel template." + + def EFS06(self): + return "The 'Ontobiotope' sheet is missing. Please check the provided excel template." + + def EFS07(self): + return "The 'Markers' sheet is missing. Please check the provided excel template." + + def EFS08(self): + return "The 'Genomic information' sheet is missing. Please check the provided excel template." + + def EFS09(self): + return "The 'Version' sheet is missing. Please check the provided excel template." + + """ + Growth Media Error Codes + """ + + def GMD01(self): + return "The 'Acronym' column is a mandatory field in the Growth Media sheet." + + def GMD02(self): + return "The 'Acronym' column is empty or has missing values." + + def GMD03(self): + return "The 'Description' column is a mandatory field in the Growth Media sheet. The column can not be empty." 
+
+    def GMD04(self):
+        return f"The 'Description' for growth media with Acronym {self.pk} is missing."
+
+    """
+    Geographic Origin Error Codes
+    """
+
+    def GOD01(self):
+        return "The 'ID' column is a mandatory field in the Geographic Origin sheet."
+
+    def GOD02(self):
+        return "The 'ID' column is empty or has missing values."
+
+    def GOD03(self):
+        return "The 'Country' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
+
+    def GOD04(self):
+        return f"The 'Country' for geographic origin with ID {self.pk} is missing."
+
+    def GOD05(self):
+        return f"The 'Country' for geographic origin with ID {self.pk} is incorrect."
+
+    def GOD06(self):
+        return f"The 'Locality' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
+
+    def GOD07(self):
+        return f"The 'Locality' for geographic origin with ID {self.pk} is missing."
+
+    """
+    Literature Error Codes
+    """
+
+    def LID01(self):
+        return "The 'ID' column is a mandatory field in the Literature sheet."
+
+    def LID02(self):
+        return "The 'ID' column is empty or has missing values."
+
+    def LID03(self):
+        return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty."
+
+    #def LID04(self):
+        #return f"The 'Full reference' for literature with ID {self.pk} is missing."
+
+    def LID05(self):
+        return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty."
+
+    #def LID06(self):
+        #return f"The 'Authors' for literature with ID {self.pk} is missing."
+
+    def LID07(self):
+        return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty."
+
+    #def LID08(self):
+        #return f"The 'Title' for literature with ID {self.pk} is missing."
+
+    def LID09(self):
+        return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty."
+
+    #def LID10(self):
+        #return f"The 'Journal' for literature with ID {self.pk} is missing."
+
+    def LID11(self):
+        return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty."
+
+    def LID12(self):
+        return f"The 'Year' for literature with ID {self.pk} is missing."
+
+    def LID13(self):
+        return "The 'Volume' column is a mandatory field in the Literature sheet. The column can not be empty."
+
+    def LID14(self):
+        return f"The 'Volume' for literature with ID {self.pk} is missing."
+
+    def LID15(self):
+        return "The 'First page' column is a mandatory field. The column can not be empty."
+
+    def LID16(self):
+        return f"The 'First page' for literature with ID {self.pk} is missing."
+
+    def LID17(self):
+        return ("There are four ways to fill in the 'Literature' sheet. "
+                "1st: the columns 'ID' and 'DOI' are mandatory. "
+                "2nd: the columns 'ID' and 'PMID' are mandatory. "
+                "3rd: the columns 'ID' and 'Full reference' are mandatory. "
+                "If none of these three options is used, then: "
+                "4th: the columns 'ID', 'Authors', 'Title', 'Journal', 'Year', 'Volume' and 'First page' are all mandatory.")
+
+    def LID18(self):
+        return "The 'PMID' column is a mandatory field. The column can not be empty."
+
+    #def LID19(self):
+        #return f"PMID for literature with ID {self.pk} is missing."
+
+    def LID20(self):
+        return "The 'DOI' column is a mandatory field. The column can not be empty."
+
+    #def LID21(self):
+        #return f"DOI for literature with ID {self.pk} is missing."
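
A minimal usage sketch of the ErrorMessage class introduced in this file (illustrative only; the import path and the example primary key "GO-12" are assumptions, adjust them to the actual package layout):

from validation.error_logging.error_message import ErrorMessage

# Render the message for a missing 'Country' value in the Geographic origin sheet.
err = ErrorMessage(code="GOD04", pk="GO-12")  # "GO-12" is a hypothetical row ID
print(err.message)  # The 'Country' for geographic origin with ID GO-12 is missing.

# Unknown codes are rejected when the message is rendered.
try:
    ErrorMessage(code="XYZ99").message
except ValueError as exc:
    print(exc)  # XYZ99 not found
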
+ + """ + Strains Error Codes + """ + def STD01(self): + return "The 'accessionNumber' column is a mandatory field in the Strains sheet." + + def STD02(self): + return "The 'accessionNumber' column is empty or has missing values." + + def STD03(self): + return f"The 'accessionNumber' must be unique. The '{self.value}' is repeated." + + def STD04(self): + return (f"The 'accessionNumber' {self.pk} is not according to the specification." + " The value must be of the format ' '.") + + def STD05(self): + return f"The 'useRestrictions' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD06(self): + return f"The 'useRestrictions' for strain with accessionNumber {self.pk} is missing." + + def STD07(self): + return (f"The 'useRestrictions' for strain with accessionNumber {self.pk} is not according to the specification." + f" Your value is {self.value} and the accepted values are 1, 2, 3.") + + def STD08(self): + return f"The 'nagoyaConditions' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD09(self): + return f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is missing." + + def STD10(self): + return (f"The 'nagoyaConditions' for strain with accessionNumber {self.pk} is not according to the specification." + f" Your value is {self.value} and the accepted values are 1, 2, 3.") + + def STD11(self): + return (f"The 'registeredCollection' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2, 3.") + + def STD12(self): + return "The 'riskGroup' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD13(self): + return f"The 'riskGroup' for strain with accessionNumber {self.pk} is missing." + + def STD14(self): + return (f"The 'riskGroup' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.") + + def STD15(self): + return (f"The 'dualUse' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2.") + + def STD16(self): + return (f"The “euQuarantine” for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2.") + + def STD17(self): + return f"The 'organismType' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD18(self): + return f"The 'organismType' for strain with accessionNumber {self.pk} is missing." + + def STD19(self): + return (f"The 'organismType' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', " + "'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.") + + def STD20(self): + return f"The 'speciesName' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD21(self): + return f"The 'speciesName' for strain with accessionNumber {self.pk} is missing." + + def STD22(self): + return f"The 'speciesName' for strain with accessionNumber {self.pk} is incorrect." + + def STD23(self): + return (f"The 'hybrid' for strain with accessionNumber {self.pk} is not according to specification." 
+ f" Your value is {self.value} and the accepted values are 1, 2.") + + def STD24(self): + return (f"The 'depositHistory' for strain with accessionNumber {self.pk} is incorrect." + "The field includes entries separated by '<' meaning 'received from'." + "Entries may include persons or CCs. The name of the CC should be followed by" + "the month, when available, and year of the acquisition. Between parentheses," + "the strain designation or CC numbers and/or a name can also be entered when " + "a name change has occurred.") + + def STD25(self): + return (f"The 'depositDate' for strain with accessionNumber {self.pk} is incorrect." + " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") + + def STD26(self): + return (f"The 'accessionDate' for strain with accessionNumber {self.pk} is incorrect." + " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") + + def STD27(self): + return (f"The 'collectionDate' for strain with accessionNumber {self.pk} is incorrect." + " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") + + def STD28(self): + return (f"The 'isolationDate' for strain with accessionNumber {self.pk} is incorrect." + " The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.") + + def STD29(self): + return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect." + " It must have two decimal numbers separated by ','") + + def STD30(self): + return f"The 'temperatureGrowthRange' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD31(self): + return f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is missing." + + def STD32(self): + return (f"The 'temperatureGrowthRange' for strain with accessionNumber {self.pk} is incorrect." + " It must have two decimal numbers separated by ','.") + + def STD33(self): + return ("The 'recommendedTemperature' column is a mandatory field in the Strains Sheet. The column can not be empty.") + + def STD34(self): + return f"The 'recommendedTemperature' for strain with accessionNumber {self.pk} is missing." + + def STD35(self): + return f"The value of 'recommendedTemperature' for strain with accessionNumber {self.pk} is not in the Growth Media Sheet." + + def STD36(self): + return f"The 'supplyForms' column is a mandatory field in the Strains Sheet. The column can not be empty." + + def STD37(self): + return f"The 'supplyForms' for strain with accessionNumber {self.pk} is missing." + + def STD38(self): + return f"The value of 'supplyForms' for strain with accessionNumber {self.pk} is not in the Forms of Supply Sheet." + + def STD39(self): + return (f"The 'geographicCoordinates' column for strain with accessionNumber {self.pk} is incorrect." + "The allowed formats are two, three or four decimal numbers separated by ','. Moreover, the first number must be." + "between [-90, 90], the second between [-180, 180], and the third and fourth refers to the precision and altitude, defined by decimal numbers." + "Put a question mark for lack of precision or altitude when one of them is missing. Leave the values blank when both are missing. ") + + def STD40(self): + return (f"The 'country' column for strain with accessionNumber {self.pk} is incorrect." + "The allowed formats are one decimal number between [-200, 8000].") + def STD54(self): + return (f"The 'country'column is a mandatory field in the Strains Sheet. 
The column can not be empty.") + def STD55(self): + return (f"The 'country' for strain with accessionNumber {self.pk} is missing.") + + def STD41(self): + return f"The value of 'ontobiotopeTerms' for strain with accessionNumber {self.pk} is not in the Ontobiotope Sheet." + + def STD42(self): + return (f"The 'gmo' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2") + + def STD43(self): + return (f"The 'sexualState' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', " + "'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'") + + def STD44(self): + return (f"The 'ploidy' for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9") + + def STD45(self): + msg = f"At least one of the values '{self.value}' of the literature field for strain {self.pk} are not in the literature sheet. " + msg += "If the those values are Pubmed ids or DOIs, please ignore this messsage" + return msg + + def STD46(self): + return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is not according to specification." + f"The 'geographicOrigin' column must consist of the ID's associated with the Geographic origin sheet.") + + def STD47(self): + return "The 'country' column is a mandatory field in the Strains sheet." + + def STD48(self): + return "The 'country' column is empty or has missing values." + + def STD49(self): + return (f"The “qps” for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 1, 2.") + + def STD50(self): + return (f"The “axenicCulture” for strain with accessionNumber {self.pk} is not according to specification." + f" Your value is {self.value} and the accepted values are 'Axenic', 'Not axenic'.") + + def STD51(self): + return f"The 'mirriAccessionNumber' must be unique. The '{self.pk}' is repeated." + + def STD52(self): + return (f"The 'mirriAccessionNumber' for strain with accessionNumber {self.pk} is incorrect." + " It must have the expression MIRRI followed by 7 digits") + + def STD53(self): + return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect." + " The displayed expression it should be composed of: site name ';' website url." ) + + def STD56(self): + return (f"The 'siteLinks' for strain with accessionNumber {self.pk} is incorrect." + " The url must be valid. " ) + def STD57(self): + return (f"The 'country' for strain with accessionNumber {self.pk} is incorrect." + "This information must be expressed by using the ISO-3166 standard for country" + "codes. The preferred set is ISO 3166-1 alpha-2 (two letters code), but ISO 3166-" + "1 alpha-3 (three letters code) is also accepted. Former country codes must" + "follow standard’s part three ISO 3166-3 (four letters code). Only one code can" + "be included." ) + def STD58(self): + return (f"The 'mtaFile' for strain with accessionNumber {self.pk} is incorrect." + " The url must be valid. " ) + def STD59(self): + return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect." + "The displayed expression it should be composed of: name ';' website url." + "When only one URL is provided, the title may be omitted. 
In this case, the URL" + "will be shown in clear to users." ) + def STD60(self): + return (f"The 'absFile' for strain with accessionNumber {self.pk} is incorrect." + " The url must be valid. ") + def STD61(self): + return (f"The 'sequenceLiterature' for strain with accessionNumber {self.pk} is incorrect." + "Numeric identifiers separated by a semicolon ';'.") + + def STD62(self): + return (f"The 'plasmidCollections' for strain with accessionNumber {self.pk} is incorrect." + "It should include the name of the plasmid followed by the CC number in" + "parentheses. More than one plasmid can be reported, separated by ';'. " + "Plasmid names should be provided as free text." + "CC numbers should be composed by the CC acronym followed by a number" + "separated by a space'. Numeric identifiers separated by a semicolon ';'.") + + def STD63(self): + return (f"The 'otherCollectionNumbers' for strain with accessionNumber {self.pk} is incorrect." + " The value must be of the format ' '.") + + def STD64(self): + return (f"The 'type' for strain with accessionNumber {self.pk} is incorrect." + f"Your value is {self.value} and the accepted values are 1, 2.") + + def STD64(self): + return (f"The 'status' for strain with accessionNumber {self.pk} is incorrect." + "When the type equal 2 the status must contain the type," + "if the type equal 2 the status must have a null value.") + + def STD65(self): + return (f"The 'status' for strain with accessionNumber {self.pk} is incorrect." + "The structure should be 'type of .") + + def STD68(self): + return (f"The 'geographicOrigin'column is a mandatory field in the Strains Sheet. The column can not be empty.") + + def STD69(self): + return (f"The 'geographicOrigin' for strain with accessionNumber {self.pk} is missing.") + + """ + Genomic Information Error Codes + """ + + def GID01(self): + return f"The 'Strain accessionNumber' (Strain AN) column is a mandatory field in the Genomic Information Sheet." + + def GID02(self): + return f"The 'Strain accessionNumber' (Strain AN) column is empty or has missing values." + + def GID03(self): + return f"The value of 'Strain accessionNumber' (Strain AN) {self.value} is not in the Strains sheet." + + def GID04(self): + return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty." + + def GID05(self): + return f"The 'Marker' for genomic information with Strain AN {self.pk} is missing." + + def GID06(self): + return f"The value of 'Marker' {self.value} is not in the Markers sheet." + + def GID07(self): + return f"The 'INSDC AN' column is a mandatory field in the Genomic Information Sheet. The column can not be empty." + + def GID08(self): + return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is missing." + + def GID09(self): + return f"The 'INSDC AN' for genomic information with Strain AN {self.pk} is incorrect." + + def GID10(self): + return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect." + " It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces.") + + def GID11(self): + return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect." + "An INSDC accession number is an alphanumeric" + "code made by a fixed number of letters followed by a fixed number of digits," + "without any separation. 
For sequences, the code is currently made of two" + "letters followed by six numbers.") + + + """ + Ontobiotope Error Codes + """ + + def CTR01(self): + return "The 'Version' columns is a mandatory field in the Version Sheet." + + def CTR02(self): + return "The 'Version' columns is empty or has missing values." + + def CTR03(self): + return "The 'Date' columns is a mandatory field in the Control Sheet." + + def CTR04(self): + return "The 'Date' columns is empty or has missing values." + + def CTR05(self): + return f"The version {self.value} is the only one to be used." + + + + """ + Ontobiotope Error Codes + """ + + def OTD01(self): + return "The 'ID' columns is a mandatory field in the Ontobiotope Sheet." + + def OTD02(self): + return "The 'ID' columns is empty or has missing values." + + #def OTD03(self): + return "The 'Name' columns is a mandatory field in the Ontobiotope Sheet. The column can not be empty." + + #def OTD04(self): + return f"The 'Name' for ontobiotope with ID {self.pk} is missing." + + + + \ No newline at end of file diff --git a/mirri/validation/excel_validator.py b/validation/excel_validator.py similarity index 63% rename from mirri/validation/excel_validator.py rename to validation/excel_validator.py index 3b8e946..c53ddec 100644 --- a/mirri/validation/excel_validator.py +++ b/validation/excel_validator.py @@ -4,26 +4,51 @@ from io import BytesIO from zipfile import BadZipfile from datetime import datetime from calendar import monthrange - +import requests from openpyxl import load_workbook +import pycountry from mirri.io.parsers.excel import workbook_sheet_reader, get_all_cell_data_from_sheet from mirri.validation.error_logging import ErrorLog, Error from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE, ERROR_CODE, FIELD, MANDATORY, MATCH, - MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, - TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO) + MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, + TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO, DOMINIO,URL_DOMINIO, ISO, URL_TITLE,JUST_URL,TITLE, + HISTORY,NAGOYA1, VERSION) from mirri.settings import LOCATIONS, SUBTAXAS -from mirri.validation.validation_conf_20200601 import MIRRI_20200601_VALLIDATION_CONF +from mirri.settings_v1 import LOCATIONS, SUBTAXAS +from mirri.validation.validation_conf_12052023 import version_config -def validate_mirri_excel(fhand, version="20200601"): - if version == "20200601": - configuration = MIRRI_20200601_VALLIDATION_CONF - else: - raise NotImplementedError("Only version20200601 is implemented") - +def validate_mirri_excel(fhand, version="", date=""): + configuration = version_config.get(version) + if configuration is None: + raise NotImplementedError("Unsupported version: " + version) + configuration["date"] = date or configuration.get("date") + if configuration["date"] != "12/05/2023": + raise ValueError("Invalid date. 
Expected: 12/05/2023") return validate_excel(fhand, configuration) + + +def version(value , validation_conf=None): + if value is None: + return True + try: + for version in version_config: + if value == version : + return True + except: + return False + + +def validate_country_code(value,validation_conf=None): + if value is None: + return True + try: + if pycountry.countries.get(alpha_2=value) or pycountry.countries.get(alpha_3=value) or pycountry.historic_countries.get(alpha_4 = value): + return True + except: + return False def validate_excel(fhand, configuration): @@ -185,11 +210,16 @@ def validate_row(row, validation_steps, in_memory_sheets): kind = validation_step[TYPE] error_code = validation_step[ERROR_CODE] if kind == NAGOYA: - if not is_valid_nagoya(row, in_memory_sheets): + if not is_valid_nagoya_v20200601(row, in_memory_sheets): + return error_code + if not is_valid_nagoya_v12052023(row, in_memory_sheets): return error_code elif kind == BIBLIO: if not is_valid_pub(row): return error_code + elif kind == NAGOYA1: + if not is_valid_nago(row): + return error_code else: msg = f'{kind} is not a recognized row validation type method' raise NotImplementedError(msg) @@ -207,35 +237,58 @@ def validate_cell(value, validation_steps, crossrefs, shown_values, label): if error_code is not None: return error_code - - + def is_valid_pub(row): + pub_id = row.get('ID', None) + pub_pmid = row.get('PMID', None) + pub_doi = row.get('DOI', None) title = row.get('Title', None) full_reference = row.get('Full reference', None) authors = row.get('Authors', None) journal = row.get('Journal', None) year = row.get('Year', None) - volumen = row.get('Volumen', None) + volumen = row.get('Volume', None) first_page = row.get('First page', None) book_title = row.get('Book title', None) editors = row.get('Editors', None) publishers = row.get('Publishers', None) - if full_reference: + if (pub_id != None and pub_doi != None) or (pub_id != None and pub_pmid != None) or (pub_id != None and full_reference != None) or (pub_id != None and authors != None and title != None and journal != None and year != None and volumen != None and first_page != None) : return True is_journal = bool(title) - if (is_journal and (not authors or not journal or not not year or - not volumen or not first_page)): - return False - if (not is_journal and (not authors or not year or - not editors or not publishers or not book_title)): - return False + # if (is_journal and (not authors or not journal or not not year or + # not volumen or not first_page)): + # return False + #if (not is_journal and (not authors or not year or + # not editors or not publishers or not book_title)): + # return False + return False + +def is_valid_nago(row): + if not row: + return True + status = row.get("status", None) + type = row.get("type", None) + regex = r'^[a-zA-Z\s.\'-]+$' + + if status != None and type != None: + if (re.match(regex, status) and type==1): + return False + if (type == 2 and status is None): + return False return True +def parsee_mirri_excel(row, in_memory_sheets, version=""): + if version == "20200601": + return is_valid_nagoya_v20200601 (row, in_memory_sheets) + elif version == "12052023": + return is_valid_nagoya_v12052023 (row, in_memory_sheets) + else: + raise NotImplementedError("Only versions 20200601 and 12052023 are implemented") -def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity +def is_valid_nagoya_v20200601(row, in_memory_sheets): # sourcery skip: return-identity location_index = row.get('Geographic origin', 
None) if location_index is None: country = None @@ -257,9 +310,36 @@ def is_valid_nagoya(row, in_memory_sheets): # sourcery skip: return-identity if year is not None and year >= 2014 and country is None: return False + + return True +def is_valid_nagoya_v12052023(row, in_memory_sheets): # sourcery skip: return-identity + location_index = row.get('geographicOrigin', None) + if location_index is None: + country = None + else: + geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {}) + country = geo_origin.get('Country', None) + + _date = row.get("collectionDate", None) + if _date is None: + _date = row.get("isolationDate", None) + if _date is None: + _date = row.get("depositDate", None) + if _date is None: + _date = row.get("accessionDate", None) + if _date is not None: + year = _date.year if isinstance(_date, datetime) else int(str(_date)[:4]) + else: + year = None + + if year is not None and year >= 2014 and country is None: + return False + + + return True def is_valid_regex(value, validation_conf): if value is None: @@ -310,7 +390,9 @@ def is_valid_choices(value, validation_conf): values = [v.strip() for v in str(value).split(separator)] else: values = [str(value).strip()] - + sorted_values = sorted(values) + if sorted_values != values: + return False return all(value in choices for value in values) @@ -352,47 +434,145 @@ def is_valid_date(value, validation_conf): return True -def is_valid_coords(value, validation_conf=None): - # sourcery skip: return-identity +def is_valid_dominio(value, validation_conf=None): if value is None: return True try: items = [i.strip() for i in value.split(";")] - latitude = float(items[0]) - longitude = float(items[1]) - if len(items) > 2: - precision = float(items[2]) - if latitude < -90 or latitude > 90: - return False - if longitude < -180 or longitude > 180: - return False + if len(items) >1: + for i in range(0, len(items),2): + nameSite = str(items[i]) + urlSite = str(items[i+1]) + dominio = urlSite.split(".")[-2] + if nameSite.lower() != dominio: + return False + return True except: - return False + return False + +def is_valid_title(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + if len(items) >1: + for i in range(0, len(items),2): + nameSite = (items[i]) + urlSite = str(items[i+1]) + regex = r'^(http|https):\/\/[a-z0-9\-\.]+\.[a-z]{2,}([/a-z0-9\-\.]*)*$' + if re.match(regex, nameSite) or isinstance(nameSite, int) or nameSite == '': + return False + return True + except: + return False + +def is_valid_url_title(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + if len(items) ==1: + urlSite = str(items[0]) + response = requests.head(urlSite) + if response.status_code != 200: + return False + + else: + items = [i.strip() for i in value.split(";")] + for i in range(0, len(items),2): + nameSite = (items[i]) + urlSite = str(items[i+1]) + response = requests.head(urlSite) + if response.status_code != 200: + return False + + + return True + except: + return False + + +def is_valid_url_dominio(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split(";")] + for i in range(0, len(items),2): + nameSite = str(items[i]) + urlSite = str(items[i+1]) + response = requests.head(urlSite) + if response.status_code != 200: + return False + + return True + except: + return False + + +def is_valid_just_url(value, validation_conf=None): + if value is None: + return 
True + try: + items = [i.strip() for i in value.split(";")] + for i in items: + nameSite = str(items[0]) + response = requests.head(i) + if response.status_code != 200: + return False + + return True + except: + return False + + +def is_valid_history(value, validation_conf=None): + if value is None: + return True + try: + items = [i.strip() for i in value.split("<")] + for i in items: + regex1 = r'^[a-zA-Z &,;.''-]+, ((19|20)\d{2})' + regex2 = r'^[a-zA-Z &,;.''-]+, [a-zA-Z &,;.''-] (19|20)\d{2}\s\([a-zA-Z &,;.''-]+\)' + regex3 = r'^[a-zA-Z &,;.''-]+\, [a-zA-Z &,;.''-]' + regex4 = r'^[a-zA-Z &,;.''-]+, (19|20)\d{2}\s\([a-zA-Z .''-,;&]+\)' + regex5 = r'^[a-zA-Z &,;.''-]+, \([a-zA-Z &,;.''-]+\) (19|20)\d{2}' + if re.match(regex1, i): + return True + elif re.match(regex2, i): + return True + elif re.match(regex3, i): + return True + elif re.match(regex4, i): + return True + elif re.match(regex5, i): + return True + else: + return False + except: + return False + + +def is_valid_coords(value, validation_conf=None): + if value is None: + return True + try: + + regex1 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?);-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?)$' + regex2 = r'^-?(90(\.0+)?|[1-8]?\d(\.\d+)?);-?(180(\.0+)?|((1[0-7]\d)|(\d{1,2}))(\.\d+)?);(\d+\.\d+|\?);(\d+\.\d+|\?)$|^(\d+\.\d+|\?)$|^;$' + + if not re.match(regex1, value) and not re.match(regex2, value): + return False + + return True + except: + return False def is_valid_missing(value, validation_conf=None): return value is not None -def is_valid_number(value, validation_conf): - if value is None: - return True - try: - value = float(value) - except TypeError: - return False - except ValueError: - return False - - _max = validation_conf.get('max', None) - _min = validation_conf.get('min', None) - if (_max is not None and value > _max) or (_min is not None and value < _min): - return False - - return True - - def is_valid_taxon(value, validation_conf=None): multiple = validation_conf.get(MULTIPLE, False) separator = validation_conf.get(SEPARATOR, ';') @@ -429,6 +609,8 @@ def _is_valid_taxon(value): def is_valid_unique(value, validation_conf): + if not value: + return True label = validation_conf['label'] shown_values = validation_conf['shown_values'] if label not in shown_values: @@ -444,7 +626,6 @@ def is_valid_unique(value, validation_conf): return True - def is_valid_file(path): try: with path.open("rb") as fhand: @@ -464,8 +645,15 @@ VALIDATION_FUNCTIONS = { CROSSREF: is_valid_crossrefs, DATE: is_valid_date, COORDINATES: is_valid_coords, - NUMBER: is_valid_number, TAXON: is_valid_taxon, + TITLE: is_valid_title, + DOMINIO: is_valid_dominio, + URL_TITLE: is_valid_url_title, + URL_DOMINIO: is_valid_url_dominio, + JUST_URL: is_valid_just_url, + ISO: validate_country_code, + HISTORY: is_valid_history, + VERSION: version, UNIQUE: is_valid_unique} diff --git a/mirri/validation/tags.py b/validation/tags.py similarity index 62% rename from mirri/validation/tags.py rename to validation/tags.py index ef036c9..9fb35e0 100644 --- a/mirri/validation/tags.py +++ b/validation/tags.py @@ -16,9 +16,20 @@ MATCH = 'match' VALUES = 'values' DATE = 'date' COORDINATES = 'coord' +COORDINATES1 = 'coord1' NUMBER = 'number' TAXON = 'taxon' UNIQUE = 'unique' ROW_VALIDATION = 'row_validation' NAGOYA = 'nagoya' BIBLIO = 'bibliography' +DOMINIO= 'is_valid_dominio' +TITLE= 'is_valid_title' +URL_DOMINIO = 'urll_valid_dominio' +URL_TITLE= 'is_valid_url_title' +ISO = 'validate_country_code' +JUST_URL= 'is_valid_just_url' +HISTORY= 'is_valid_history' 
+MEU='is_valid_crossrefs_meu' +NAGOYA1 = 'nayoga1' +VERSION = 'version' \ No newline at end of file diff --git a/bin/validate.py b/validation/validate_v5.py similarity index 57% rename from bin/validate.py rename to validation/validate_v5.py index 86a10fc..3cd828a 100644 --- a/bin/validate.py +++ b/validation/validate_v5.py @@ -1,14 +1,21 @@ #!/usr/bin/env python +import pandas as pd import sys from pathlib import Path -from mirri.validation.excel_validator import validate_mirri_excel -import warnings +import warnings warnings.simplefilter("ignore") - +from mirri.validation.excel_validator import validate_mirri_excel def main(): path = Path(sys.argv[1]) - error_log = validate_mirri_excel(path.open("rb")) + version = str(sys.argv[2]) + date = str(sys.argv[3]) + try: + + error_log = validate_mirri_excel(path.open("rb"), version=version, date=date) + + except NotImplementedError as e: + print(e) for errors in error_log.get_errors().values(): for error in errors: @@ -16,4 +23,4 @@ def main(): if __name__ == "__main__": - main() + main() \ No newline at end of file diff --git a/validation/validation_conf_12052023.py b/validation/validation_conf_12052023.py new file mode 100644 index 0000000..74c3a54 --- /dev/null +++ b/validation/validation_conf_12052023.py @@ -0,0 +1,675 @@ +from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE, + ERROR_CODE, FIELD, MANDATORY, MATCH, + MISSING, MULTIPLE, NAGOYA, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE, + UNIQUE,VERSION, + VALIDATION, VALUES, BIBLIO, DOMINIO, URL_DOMINIO,ISO, JUST_URL, URL_TITLE, TITLE, HISTORY,NAGOYA1) +from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, + STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS, CONTROL_SHEET,) + + + +# GEOGRAPHIC_ORIGIN +# SEXUAL_STATE_SHEET, +# RESOURCE_TYPES_VALUES, +# FORM_OF_SUPPLY_SHEET, +# PLOIDY_SHEET) + + + + +STRAIN_FIELDS = [ + + { + FIELD: "accessionNumber", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: 'STD01'}, + {TYPE: UNIQUE, ERROR_CODE: 'STD03'}, + {TYPE: MISSING, ERROR_CODE: "STD02"}, + {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"} + ] + }, + { + FIELD: "useRestrictions", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD05"}, + {TYPE: MISSING, ERROR_CODE: "STD06"}, + {TYPE: CHOICES, VALUES: ["1", "2", "3"], + MULTIPLE: False, ERROR_CODE: "STD07"} + ] + }, + { + FIELD: "mirriAccessionNumber", + VALIDATION: [ + {TYPE: UNIQUE, ERROR_CODE: 'STD51'}, + {TYPE: REGEXP, MATCH: "^MIRRI[0-9]{7}$", ERROR_CODE: "STD52"}, + ], + }, + + { + FIELD: "nagoyaConditions", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD08"}, + {TYPE: MISSING, ERROR_CODE: "STD09"}, + {TYPE: CHOICES, VALUES: ["1", "2", "3"], + MULTIPLE: False, ERROR_CODE: "STD10"} + ] + }, + { + FIELD: "absFile", + VALIDATION: [ + {TYPE: TITLE, ERROR_CODE: "STD59"}, + {TYPE: URL_TITLE, ERROR_CODE: "STD60", + MULTIPLE: True, SEPARATOR: ";"}, + ], + }, + + { + FIELD: "siteLinks", + VALIDATION: [ + {TYPE: DOMINIO, ERROR_CODE: "STD53", + MULTIPLE: False, SEPARATOR: ";"}, + {TYPE: URL_DOMINIO, ERROR_CODE: "STD56", + MULTIPLE: False, SEPARATOR: ";"}, + ], + }, + { + FIELD: "mtaFile", + VALIDATION: [ + {TYPE: JUST_URL, ERROR_CODE: "STD58", + MULTIPLE: True, SEPARATOR: ";"}, + ], + }, + { + FIELD: "otherCollectionNumbers", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "([^ ]* [^ ]*)(; [^ ]* [^ ]*)*$", ERROR_CODE: "STD63"}, + #{TYPE: CROSSREF, CROSSREF_NAME: "Strains", ERROR_CODE: "STD64"}, + ] + }, + { + FIELD: "registeredCollection", + VALIDATION: [ + {TYPE: 
CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD11"} + ] + }, + { + FIELD: "type", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], ERROR_CODE: "STD64"}, + ] + }, + { + FIELD: "riskGroup", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD12"}, + {TYPE: MISSING, ERROR_CODE: "STD13"}, + {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"], + MULTIPLE: False, ERROR_CODE: "STD14"} + ] + }, + { + FIELD: "dualUse", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD15"} + ] + }, + { + FIELD: "euQuarantine", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD16"} + ] + }, + { + FIELD: "axenicCulture", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["Axenic", "Not axenic"], + ERROR_CODE: "STD50"} + ] + }, + + { + FIELD: "organismType", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD17"}, + {TYPE: MISSING, ERROR_CODE: "STD18"}, + {TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria", + "Cyanobacteria", "Filamentous Fungi", "Filamentous fungi", + "Yeast", "Microalgae", + "1", "2", "3", "4", "5", "6", "7"], + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"} + ] + }, + { + FIELD: "speciesName", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD20"}, + {TYPE: MISSING, ERROR_CODE: "STD21"}, + {TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True, + SEPARATOR: ';'} + ] + }, + { + FIELD: "infrasubspecificNames", + VALIDATION: [] + }, + { + FIELD: "taxonomyComments", + VALIDATION: [] + }, + { + FIELD: "hybrid", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD23"} + ] + }, + { + FIELD: "status", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "^(type of|neotype of|holotype of |epitype of) ([a-zA-Z .'-]+)$", ERROR_CODE: "STD65"}, + + ] + + }, + { + FIELD: "depositHistory", + VALIDATION: [ + {TYPE: HISTORY, ERROR_CODE: 'STD24'}, + ] + }, + { + FIELD: "depositor", + VALIDATION: [] + }, + { + FIELD: "depositDate", + VALIDATION: [ + {TYPE: DATE, ERROR_CODE: "STD25"}, + ] + }, + { + FIELD: "accessionDate", + VALIDATION: [ + {TYPE: DATE, ERROR_CODE: "STD26"}, + ] + }, + { + FIELD: "collector", + VALIDATION: [] + }, + + + { + FIELD: "substrate", + VALIDATION: [] + }, + { + FIELD: "temperatureGrowthRange", + VALIDATION: [ + {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', + ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"} + ] + }, + { + FIELD: "recommendedTemperature", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD30"}, + {TYPE: MISSING, ERROR_CODE: "STD31"}, + {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', + ERROR_CODE: "STD32", + MULTIPLE: True, SEPARATOR: ";"} + ] + }, + + { + FIELD: "supplyForms", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD36"}, + {TYPE: MISSING, ERROR_CODE: "STD37"}, + {TYPE: CHOICES, VALUES: ['Agar', 'Cryo', 'Dry Ice', 'Liquid Culture Medium', + 'Lyo', 'Oil', 'Water'], + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"} + ] + }, + { + FIELD: "otherDenomination", + VALIDATION: [] + }, + { + FIELD: "geographicCoordinates", + VALIDATION: [ + {TYPE: COORDINATES, ERROR_CODE: "STD39"}, + + ] + }, + + { + # value can be in the cell or in another sheet. 
Don't configure this + FIELD: "geographicOrigin", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD68"}, + {TYPE: MISSING, ERROR_CODE: "STD69"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Geographic origin", ERROR_CODE: "STD46"}, + ] + }, + + { + FIELD: "isolationHabitat", + VALIDATION: [] + }, + { + FIELD: "ontobiotopeTerms", + VALIDATION: [ + {TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope", + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"} + ] + }, + { + FIELD: "qps", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD49"} + ] + }, + { + FIELD: "gmo", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD42"} + ] + }, + { + FIELD: "gmoConstruction", + VALIDATION: [] + }, + { + FIELD: "mutant", + VALIDATION: [] + }, + { + FIELD: "genotype", + VALIDATION: [] + }, + { + FIELD: "Plant pathogenicity code", + VALIDATION: [] + }, + { + FIELD: "sexualState", + VALIDATION: [ + {TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET, + ERROR_CODE: "STD43"} + # {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha", + # "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha", + # "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"], + # ERROR_CODE: "STD43"} + ] + }, + { + FIELD: "ploidy", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2", "3", "4", "5", "9"], + ERROR_CODE: "STD44"} + ] + }, + { + FIELD: "plasmids", + VALIDATION: [] + }, + { + FIELD: "plasmidCollections", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\)(; ([a-zA-Z .'-]+)\(([a-zA-Z .'-]+) (\d+)\))*$", + ERROR_CODE: "STD62"} + ] + }, + { + # value can be in the cell or in another sheet. Don't configure this + FIELD: "identificationLiterature", + VALIDATION: [ + {TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET, + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"} + ] + }, + { + FIELD: "pathogenicity", + VALIDATION: [] + }, + { + FIELD: "enzymes", + VALIDATION: [] + }, + { + FIELD: "metabolites", + VALIDATION: [] + }, + { + FIELD: "applications", + VALIDATION: [] + }, + { + FIELD: "remarks", + VALIDATION: [] + }, + { + FIELD: "sequenceLiterature", + VALIDATION: [ + {TYPE: REGEXP, MATCH: "^\d+(; \d+)*$", ERROR_CODE: "STD61"}, + ] + + }, + + { + FIELD: "recommendedMedium", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD33"}, + {TYPE: MISSING, ERROR_CODE: "STD34"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Growth media", + MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"} + ] + }, + + + { + FIELD: "country", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD54"}, + {TYPE: MISSING, ERROR_CODE: "STD55"}, + {TYPE: ISO, ERROR_CODE: "STD57"}, + #{TYPE: CROSSREF, CROSSREF_NAME: COUNTRY_CODES_SHEET, ERROR_CODE: "STD57"} + ] + }, +] +SHEETS_SCHEMA = { + LOCATIONS: { + "acronym": "GOD", + "id_field": "ID", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"}, + COLUMNS: [ + { + FIELD: "ID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GOD01"}, + {TYPE: MISSING, ERROR_CODE: "GOD02"}, + ] + }, + { + FIELD: "Country", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GOD03"}, + {TYPE: MISSING, ERROR_CODE: "GOD04"}, + ] + }, + { + FIELD: "Region", + VALIDATION: [] + }, + { + FIELD: "City", + VALIDATION: [] + }, + { + FIELD: "Locality", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GOD06"}, + {TYPE: MISSING, ERROR_CODE: "GOD07"} + ] + } + ], + }, + GROWTH_MEDIA: { + "acronym": "GMD", + "id_field": "Acronym", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"}, + COLUMNS: [ + { + FIELD: "Acronym", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GMD01"}, + {TYPE: MISSING, 
ERROR_CODE: "GMD02"} + ] + }, + { + FIELD: "Description", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GMD03"}, + {TYPE: MISSING, ERROR_CODE: "GMD04"} + ] + }, + { + FIELD: "Full description", + VALIDATION: [] + }, + ], + }, + GENOMIC_INFO: { + "acronym": "GID", + "id_field": "Strain AN", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"}, + COLUMNS: [ + { + FIELD: "Strain AN", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GID01"}, + {TYPE: MISSING, ERROR_CODE: "GID02"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Strains", + ERROR_CODE: "GID03"}, + ] + }, + { + FIELD: "Marker", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GID04"}, + {TYPE: MISSING, ERROR_CODE: "GID05"}, + {TYPE: CROSSREF, CROSSREF_NAME: MARKERS, ERROR_CODE: "GID06"} + ] + }, + { + FIELD: "INSDC AN", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GID07"}, + {TYPE: MISSING, ERROR_CODE: "GID08"}, + {TYPE: REGEXP, MATCH: "^[A-Z]{2}[0-9]{6}$", ERROR_CODE: "GID11"}, + ] + }, + { + FIELD: "Sequence", + VALIDATION: [] + }, + ], + }, + STRAINS: { + "acronym": "STD", + 'id_field': 'accessionNumber', + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"}, + ROW_VALIDATION: [ + #{TYPE: NAGOYA, ERROR_CODE: "STD46"}, + {TYPE: NAGOYA1, ERROR_CODE: "STD64"} + ], + COLUMNS: STRAIN_FIELDS, + }, + LITERATURE_SHEET: { + "acronym": "LID", + 'id_field': 'ID', + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"}, + ROW_VALIDATION: [ + {TYPE: BIBLIO, ERROR_CODE: 'LID17'} + ], + COLUMNS: [ + { + FIELD: "ID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID01"}, + {TYPE: MISSING, ERROR_CODE: "LID02"}, + ] + }, + { + FIELD: "PMID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID18"}, + ] + }, + { + FIELD: "DOI", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID20"}, + ] + }, + { + FIELD: "Full reference", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID03"}, + ] + }, + { + FIELD: "Authors", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID05"}, + ] + }, + { + FIELD: "Title", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID07"}, + ] + }, + { + FIELD: "Journal", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID09"}, + ] + }, + { + FIELD: "Year", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID11"}, + ] + }, + { + FIELD: "Volume", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID13"}, + ] + }, + { + FIELD: "Issue", + VALIDATION: [] + }, + { + FIELD: "First page", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID15"}, + ] + }, + { + FIELD: "Last page", + VALIDATION: [] + }, + { + FIELD: "Book title", + VALIDATION: [] + }, + { + FIELD: "Editors", + VALIDATION: [] + }, + { + FIELD: "Publisher", + VALIDATION: [] + } + ], + }, + # SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []}, + # RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []}, + # FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []}, + # PLOIDY_SHEET: {"acronym": "PLD", COLUMNS: []}, + ONTOBIOTOPE: { + "acronym": "OTD", + "id_field": "ID", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"}, + COLUMNS: [ + { + FIELD: "ID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "OTD01"}, + {TYPE: MISSING, ERROR_CODE: "OTD02"}, + ] + }, + { + FIELD: "Name", + VALIDATION: [] + }, + ] + }, + + + + + + CONTROL_SHEET: { + "acronym": "CTR", + "id_field": "Version", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS09"}, + COLUMNS: [ + { + FIELD: "Version", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "CTR01"}, + {TYPE: MISSING, ERROR_CODE: "CTR02"}, + {TYPE: VERSION, ERROR_CODE: "CTR05"}, + ] + }, + { + FIELD: "Date", + VALIDATION: [ + {TYPE: MANDATORY, 
ERROR_CODE: "CTR03"}, + {TYPE: MISSING, ERROR_CODE: "CTR04"}, + ] + }, + ] + }, + + MARKERS: { + "acronym": "MKD", + "id_field": "Acronym", + COLUMNS: [ + { + FIELD: "Acronym", + VALIDATION: [] + }, + { + FIELD: "Marker", + VALIDATION: [] + }, + ], + }, +} + + +CROSS_REF_CONF = { + ONTOBIOTOPE: ['ID'], + LITERATURE_SHEET: ['ID', 'DOI', 'PMID', 'Full reference', 'Authors', 'Title', 'Journal', 'Year', 'Volume', 'First page'], + LOCATIONS: ['ID', 'Locality'], + GROWTH_MEDIA: ['Acronym'], + STRAINS: ["accessionNumber"], + SEXUAL_STATE_SHEET: [], + MARKERS: ["Acronym"], + +} + +MIRRI_12052023_VALLIDATION_CONF = { + 'sheet_schema': SHEETS_SCHEMA, + 'cross_ref_conf': CROSS_REF_CONF, + 'keep_sheets_in_memory': [ + {'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}] +} + +version_config = { + '5.1.2': MIRRI_12052023_VALLIDATION_CONF, + 'date': '12/05/2023' + +} + diff --git a/mirri/validation/validation_conf_20200601.py b/validation/validation_conf_20200601.py similarity index 99% rename from mirri/validation/validation_conf_20200601.py rename to validation/validation_conf_20200601.py index 1d9752c..5f667a9 100644 --- a/mirri/validation/validation_conf_20200601.py +++ b/validation/validation_conf_20200601.py @@ -3,7 +3,7 @@ from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROS MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO) -from mirri.settings import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, +from mirri.settings_v1 import (ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET, MARKERS) # GEOGRAPHIC_ORIGIN # SEXUAL_STATE_SHEET,