def get_cmd_args():
    """Parse the command line of the duplicate-removal script.

    Reads ``sys.argv`` through ``argparse``; every option is required.

    Returns:
        dict with the keys ``accession_number``, ``user``, ``password``,
        ``client_id`` and ``client_secret``.
    """
    # The description used to be copy-pasted from the upload script.
    desc = "Delete duplicated strain records from MIRRI-IS for a given accession number"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-a', '--accession_number', required=True,
                        help='Delete the duplicated items in database for the given accession number')
    parser.add_argument('-u', '--ws_user', help='Username of the web service',
                        required=True)
    parser.add_argument('-p', '--ws_password', required=True,
                        help='Password of the web service user')
    parser.add_argument('-c', '--client_id', required=True,
                        help='Client id of the web service')
    parser.add_argument('-s', '--client_secret', required=True,
                        help='Client secret of the web service')

    args = parser.parse_args()

    return {'accession_number': args.accession_number, 'user': args.ws_user,
            'password': args.ws_password, 'client_id': args.client_id,
            'client_secret': args.client_secret}
def main():
    """Delete all but one of the strain records sharing an accession number.

    Searches the strain endpoint for an exact match of the accession number
    given on the command line. Exits with status 0 when the accession is
    missing or present only once; otherwise deletes every retrieved record
    except the last one of the result list.
    """
    args = get_cmd_args()
    out_fhand = sys.stdout

    client = BiolomicsMirriClient(server_url=SERVER_URL, api_version='v2',
                                  client_id=args['client_id'],
                                  client_secret=args['client_secret'],
                                  username=args['user'],
                                  password=args['password'])
    query = {"Query": [{"Index": 0,
                        "FieldName": "Collection accession number",
                        "Operation": "TextExactMatch",
                        "Value": args['accession_number']}],
             "Expression": "Q0",
             "DisplayStart": 0,
             "DisplayLength": 10}

    result = client.search(STRAIN_WS, query=query)
    total = result["total"]
    if total == 0:
        out_fhand.write('Accession not in database\n')
        sys.exit(0)
    if total == 1:
        out_fhand.write('Accession is not duplicated\n')
        sys.exit(0)

    out_fhand.write(f'Duplicates found: {total}. removing duplicates\n')
    duplicated_ids = [record.record_id for record in result['records']]
    # Keep the last retrieved record and drop the others.
    for duplicated_id in duplicated_ids[:-1]:
        client.delete_by_id(STRAIN_WS, duplicated_id)

    # NOTE(review): the query asks for a single page ("DisplayLength": 10), so
    # presumably fewer records than `total` can come back - confirm against the
    # web service. Tell the user instead of silently leaving duplicates behind.
    if total > len(duplicated_ids):
        out_fhand.write(f'Only {len(duplicated_ids)} of {total} records were '
                        'retrieved; run the script again to remove the '
                        'remaining duplicates\n')
def write_errors_in_screen(errors, fhand=sys.stderr):
    """Write the validation errors, grouped by section, to *fhand*.

    Args:
        errors: mapping of a section title to an iterable of error objects
            exposing ``pk``, ``message`` and ``code`` attributes.
        fhand: writable text handle; defaults to ``sys.stderr``.

    Each section is written as its title, a dashed underline of the same
    length, one line per error (prefixed by ``pk`` when it is truthy) and a
    closing blank line.
    """
    for section, section_errors in errors.items():
        chunks = [f'{section}\n', '-' * len(section) + '\n']
        for err in section_errors:
            if err.pk:
                chunks.append(f'{err.pk}: ')
            chunks.append(f'{err.message} - {err.code}\n')
        chunks.append('\n')
        for chunk in chunks:
            fhand.write(chunk)
def get_cmd_args():
    """Parse the command line of the upload script.

    Reads ``sys.argv`` through ``argparse``. The input Excel file is opened
    in binary mode by ``argparse.FileType``.

    Returns:
        dict with the keys ``input_fhand``, ``user``, ``version``,
        ``password``, ``client_id``, ``client_secret``, ``update``,
        ``verbose``, ``use_production_server``, ``add_gm``, ``add_strains``
        and ``skip_first_num``.
    """
    desc = "Upload strains to MIRRI-IS"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-i', '--input', help='Validated Excel file',
                        type=argparse.FileType('rb'), required=True)
    parser.add_argument('-v', '--spec_version', default='20200601',
                        help='Version of the specification of the given excel file')
    parser.add_argument('-u', '--ws_user', help='Username of the web service',
                        required=True)
    parser.add_argument('-p', '--ws_password', required=True,
                        help='Password of the web service user')
    parser.add_argument('-c', '--client_id', required=True,
                        help='Client id of the web service')
    parser.add_argument('-s', '--client_secret', required=True,
                        help='Client secret of the web service')
    parser.add_argument('--force_update', required=False,
                        action='store_true',
                        help='Use it if you want to update the existing strains')
    parser.add_argument('--verbose', action='store_true',
                        help='use it if you want a verbose output')
    parser.add_argument('--prod', action='store_true',
                        help='Use production server')
    # The two "dont_add" flags are store_false: the parsed attribute stays
    # True (meaning "do add") unless the flag is given on the command line.
    parser.add_argument('--dont_add_gm', action='store_false',
                        help="Don't add growth media", default=True)
    parser.add_argument('--dont_add_strains', action='store_false',
                        help="Don't add strains", default=True)
    parser.add_argument('--skip_first_num', type=int,
                        help='skip first X strains to the tool')

    args = parser.parse_args()

    return {'input_fhand': args.input, 'user': args.ws_user,
            'version': args.spec_version,
            'password': args.ws_password, 'client_id': args.client_id,
            'client_secret': args.client_secret, 'update': args.force_update,
            'verbose': args.verbose, 'use_production_server': args.prod,
            'add_gm': args.dont_add_gm, 'add_strains': args.dont_add_strains,
            'skip_first_num': args.skip_first_num}
def show_stats(counter, kind, out_fhand):
    """Write a small summary of the upload results to *out_fhand*.

    Args:
        counter: ``collections.Counter`` keyed by result state.
        kind: title of the summary (underlined with dashes of equal length).
        out_fhand: writable text handle.

    At most the five most common states are listed, followed by a blank line.
    """
    lines = [f'{kind}\n', '-' * len(kind) + '\n']
    lines.extend(f'{state}: {amount}\n'
                 for state, amount in counter.most_common(5))
    lines.append('\n')
    for line in lines:
        out_fhand.write(line)
yes +yes not a valid value for GMO, Allowed values: ?. No. Yes + +Organism Type: + first letter uppercase in deposit + lower case in retrieve + +Taxon name is a list in retrieve + + +null values: +'Comment on taxonomy' = '' could be null +'Coordinates of geographic origin':{Longitude, lati... 'NaN' could be null +'Date of inclusion in the catalogue' = '' could be null +'Enzyme production'= '' could be null +'Ploidy':'?' could be null + +Deposit date + +-------------------------------------------- + +- Assign seq to strain in strain serializers +- Fields in ws that are not in our specification. What to do with them? + - Type description - IGNORE + - Associated documents - IGNORE + - Data provided by - IGNORE + - Orders - IGNORE + - MTA text - IGNORE + - Catalog URL - + +- Publication RecordName assignment. How to do it? +- Sequence RecordName assignment. How to do it? +- Publications serializer improvement +------------------------------------------------------ + +Marker Name. Which options are allowed in WS and how do they map to the types in the specification? + +update: it should be done in the detail url. + +interspecific_hybrid set to "no" by default in web service if no value given. 
def rgetattr(obj, attr, *args):
    """Resolve a dotted attribute path such as ``'a.b.c'`` starting at *obj*.

    Args:
        obj: object the lookup starts from.
        attr: attribute names joined by dots.
        *args: optional default, forwarded to every ``getattr`` call of the
            chain (so a missing intermediate attribute yields the default and
            the lookup continues on it).

    Returns:
        The value found at the end of the path.
    """
    current = obj
    for name in attr.split('.'):
        current = getattr(current, name, *args)
    return current
def retrieve_strain_by_accession_number(client, accession_number):
    """Look up a strain by its exact collection accession number.

    Args:
        client: object whose ``search(entity_name, query=...)`` returns a dict
            with the keys ``total`` and ``records``.
        accession_number: value matched against the
            "Collection accession number" field.

    Returns:
        ``None`` when nothing matches, otherwise the single matching record.

    Raises:
        ValueError: when the search reports any other number of matches.
    """
    exact_match = {"Index": 0,
                   "FieldName": "Collection accession number",
                   "Operation": "TextExactMatch",
                   "Value": accession_number}
    query = {"Query": [exact_match],
             "Expression": "Q0",
             "DisplayStart": 0,
             "DisplayLength": 10}

    found = client.search(STRAIN_WS, query=query)
    matches = found["total"]
    if matches == 0:
        return None
    if matches == 1:
        return found["records"][0]

    msg = f"More than one entries for {accession_number} in database"
    raise ValueError(msg)
def get_or_create_publication(client: BiolomicsMirriClient, pub: Publication):
    """Return the stored publication named ``pub.title``, creating it if absent.

    Returns:
        dict with ``record`` (the retrieved or newly created publication) and
        ``created`` (``True`` only when a new record had to be created).
    """
    existing = client.retrieve_by_name(BIBLIOGRAPHY_WS, pub.title)
    if existing is None:
        fresh = client.create(BIBLIOGRAPHY_WS, pub)
        return {'record': fresh, 'created': True}
    return {'record': existing, 'created': False}
def create_strain(client: BiolomicsMirriClient, strain: StrainMirri):
    """Create *strain* in the web service together with its dependencies.

    Every publication and every genetic marker of the strain is first
    retrieved or created, then the strain itself is created.

    Returns:
        The strain record returned by ``client.create``.
    """
    # The get-or-create results are not needed here; the calls are made only
    # to ensure the records exist, so the unused locals were dropped.
    for pub in strain.publications:
        get_or_create_publication(client, pub)
    for marker in strain.genetics.markers:
        get_or_create_sequence(client, marker)

    return client.create(STRAIN_WS, strain)
class BiolomicsMirriClient:
    """Entity-level client built on top of the low-level ``BiolomicsClient``.

    ``_conf`` maps every supported entity name to its web-service endpoint
    and to the serializer functions used to convert entities to ("to") and
    from ("from") the web-service representation. Entities that only declare
    a "from" serializer can be read but not created or updated.

    The client also offers a minimal transaction: between
    ``start_transaction`` and ``finish_transaction`` the ids of the created
    records are remembered so that ``rollback`` can hand them to the
    low-level client.
    """
    _conf = {
        SEQUENCE_WS: {
            'serializers': {'to': sequence_to_biolomics,
                            'from': sequence_from_biolomics},
            'endpoint': 'WS Sequences'},
        STRAIN_WS: {
            'serializers': {'to': strain_to_biolomics,
                            'from': strain_from_biolomics},
            'endpoint': 'WS Strains'},
        GROWTH_MEDIUM_WS: {
            'serializers': {'from': growth_medium_from_biolomics,
                            'to': growth_medium_to_biolomics},
            'endpoint': 'WS Growth media'},
        TAXONOMY_WS: {
            'serializers': {'from': taxonomy_from_biolomics},
            'endpoint': 'WS Taxonomy'},
        COUNTRY_WS: {
            'serializers': {'from': country_from_biolomics},
            'endpoint': 'WS Locality'},
        ONTOBIOTOPE_WS: {
            'serializers': {'from': ontobiotope_from_biolomics},
            'endpoint': 'WS Ontobiotope'},
        BIBLIOGRAPHY_WS: {
            'serializers': {'from': bibliography_from_biolomics,
                            'to': bibliography_to_biolomics},
            'endpoint': 'WS Bibliography'
        }
    }

    def __init__(self, server_url, api_version, client_id, client_secret, username,
                 password, website_id=1, verbose=False):
        """Build the low-level client and cache its schemas and allowed fields."""
        _client = BiolomicsClient(server_url, api_version, client_id,
                                  client_secret, username, password,
                                  website_id=website_id, verbose=verbose)

        self.client = _client
        self.schemas = self.client.get_schemas()
        self.allowed_fields = self.client.allowed_fields
        self._transaction_created_ids = None
        self._in_transaction = False
        self._verbose = verbose

    @staticmethod
    def _error_payload(response):
        """Return the decoded JSON body of *response*, or its raw text.

        Error responses are not guaranteed to carry JSON; falling back to the
        text keeps the original failure visible instead of masking it with a
        decoding error.
        """
        try:
            return response.json()
        except ValueError:
            return response.text

    def _initialize_transaction_storage(self):
        """Reset the list of created ids; refuse to do so inside a transaction."""
        if self._in_transaction:
            msg = 'Can not initialize transaction if already in a transaction'
            raise RuntimeError(msg)
        self._transaction_created_ids = []

    def _add_created_to_transaction_storage(self, response, entity_name):
        """Remember the ``RecordId`` of a creation response, newest first."""
        if not self._in_transaction:
            msg = 'Can not add ids to transaction storage if not in a transaction'
            raise RuntimeError(msg)

        id_ = response.json().get('RecordId', None)
        if id_ is not None:
            ws_endpoint_name = self._conf[entity_name]['endpoint']
            # Inserted at the front so the most recent creation comes first.
            self._transaction_created_ids.insert(0, (ws_endpoint_name, id_))

    def start_transaction(self):
        """Start recording created records.

        Raises:
            RuntimeError: when a transaction is already open.
        """
        self._initialize_transaction_storage()
        self._in_transaction = True

    def finish_transaction(self):
        """Close the transaction and forget the recorded ids."""
        self._in_transaction = False
        self._transaction_created_ids = None

    def get_endpoint(self, entity_name):
        """Return the web-service endpoint name configured for *entity_name*."""
        return self._conf[entity_name]['endpoint']

    def get_serializers_to(self, entity_name):
        """Return the entity -> web-service serializer (``KeyError`` if none)."""
        return self._conf[entity_name]['serializers']['to']

    def get_serializers_from(self, entity_name):
        """Return the web-service -> entity serializer for *entity_name*."""
        return self._conf[entity_name]['serializers']['from']

    def retrieve_by_name(self, entity_name, name):
        """Return the entity called *name*, or ``None`` when it is not found.

        Raises:
            ValueError: on any status code other than 200 or 404.
        """
        endpoint = self.get_endpoint(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        response = self.client.find_by_name(endpoint, name=name)
        if response.status_code == 404:
            return None
        if response.status_code != 200:
            raise ValueError(f"{response.status_code}: {response.text}")

        ws_entity = response.json()
        # A 200 answer may still carry an empty (null) body.
        if ws_entity is None:
            return None
        return serializer_from(ws_entity, client=self)

    def retrieve_by_id(self, entity_name, _id):
        """Return the entity with record id *_id*, or ``None`` on a 404.

        Raises:
            ValueError: on any status code other than 200 or 404.
        """
        endpoint = self.get_endpoint(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        response = self.client.retrieve(endpoint, record_id=_id)
        if response.status_code == 404:
            return None
        if response.status_code != 200:
            raise ValueError(f"{response.status_code}: {response.text}")

        ws_entity = response.json()

        return serializer_from(ws_entity, client=self)

    def create(self, entity_name, entity):
        """Create *entity* remotely and return the stored entity.

        Inside a transaction the id of the new record is remembered for a
        possible rollback.

        Raises:
            RuntimeError: when the web service does not answer with 200.
        """
        endpoint = self.get_endpoint(entity_name)
        serializer_to = self.get_serializers_to(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        data = serializer_to(entity, client=self)
        response = self.client.create(endpoint, data=data)
        if response.status_code != 200:
            payload = self._error_payload(response)
            try:
                detail = payload['errors']['Value']
            except (KeyError, TypeError):
                # Unexpected error shape: report whatever the server sent.
                detail = payload
            msg = f"return_code: {response.status_code}. msg: {detail}"
            raise RuntimeError(msg)

        if self._in_transaction:
            self._add_created_to_transaction_storage(response, entity_name)
        return serializer_from(response.json(), client=self)

    def delete_by_id(self, entity_name, record_id):
        """Delete the record *record_id*.

        Raises:
            RuntimeError: when the web service does not answer with 200.
        """
        endpoint = self.get_endpoint(entity_name)
        response = self.client.delete(endpoint, record_id=record_id)
        if response.status_code != 200:
            raise RuntimeError(self._error_payload(response))

    def delete_by_name(self, entity_name, record_name):
        """Find the record called *record_name* and delete it.

        Raises:
            RuntimeError: when the lookup or the deletion does not answer 200.
            ValueError: when the lookup answers 200 with an empty body.
        """
        endpoint = self.get_endpoint(entity_name)
        response = self.client.find_by_name(endpoint, record_name)
        if response.status_code != 200:
            raise RuntimeError(self._error_payload(response))
        try:
            record_id = response.json()['RecordId']
        except TypeError:
            # The JSON body was not subscriptable (e.g. null): nothing found.
            raise ValueError(f'The given record_name {record_name} does not exists')
        self.delete_by_id(entity_name, record_id=record_id)

    def search(self, entity_name, query):
        """Run *query* against the entity endpoint.

        Returns:
            dict with ``total`` (the ``TotalCount`` reported by the service)
            and ``records`` (the deserialized ``Records`` of the answer).

        Raises:
            RuntimeError: when the web service does not answer with 200.
        """
        endpoint = self.get_endpoint(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        response = self.client.search(endpoint, search_query=query)
        if response.status_code != 200:
            raise RuntimeError(self._error_payload(response))
        search_result = response.json()
        result = {'total': search_result['TotalCount'],
                  'records': [serializer_from(record, client=self)
                              for record in search_result['Records']]}
        return result

    def update(self, entity_name, entity):
        """Update the remote record identified by ``entity.record_id``.

        Returns:
            The entity deserialized from the web-service answer.

        Raises:
            ValueError: when the entity has no ``record_id``.
            RuntimeError: when the web service does not answer with 200.
        """
        record_id = entity.record_id
        if record_id is None:
            msg = 'In order to update the record, you need the recordId in the entity'
            raise ValueError(msg)
        endpoint = self.get_endpoint(entity_name)
        serializer_to = self.get_serializers_to(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        data = serializer_to(entity, client=self, update=True)
        response = self.client.update(endpoint, record_id=record_id, data=data)
        if response.status_code != 200:
            msg = f"return_code: {response.status_code}. msg: {response.text}"
            raise RuntimeError(msg)

        return serializer_from(response.json(), client=self)

    def rollback(self):
        """Leave the transaction and hand the recorded ids to the low-level client."""
        self._in_transaction = False
        self.client.rollback(self._transaction_created_ids)
        self._transaction_created_ids = None
OAuth2Session(client=self._client) + try: + token = oauth.fetch_token( + token_url=self._auth_url, + username=self._username, + password=self._password, + client_id=self._client_id, + client_secret=self._client_secret, + ) + except InvalidGrantError: + oauth.close() + raise + self.access_token = token["access_token"] + oauth.close() + return self.access_token + + def _build_headers(self): + self.get_access_token() + return { + "accept": "application/json", + "websiteId": str(self.website_id), + "Authorization": f"Bearer {self.access_token}", + } + + def get_detail_url(self, end_point, record_id, api_version=None): + # api_version = self._api_version if api_version is None else api_version + if api_version: + return "/".join([self.server_url, api_version, 'data', + end_point, str(record_id)]) + else: + return "/".join([self.server_url, 'data', end_point, str(record_id)]) + + def get_list_url(self, end_point): + return "/".join([self.server_url, 'data', end_point]) + # return "/".join([self.server_url, self._api_version, 'data', end_point]) + + def get_search_url(self, end_point): + return "/".join([self.server_url, self._api_version, 'search', end_point]) + + def get_find_by_name_url(self, end_point): + return "/".join([self.get_search_url(end_point), 'findByName']) + + def search(self, end_point, search_query): + self._check_end_point_exists(end_point) + header = self._build_headers() + url = self.get_search_url(end_point) + time0 = time.time() + response = requests.post(url, json=search_query, headers=header) + time1 = time.time() + if self._verbose: + sys.stdout.write(f'Search to {end_point} request time for {url}: {time1 - time0}\n') + return response + + def retrieve(self, end_point, record_id): + self._check_end_point_exists(end_point) + header = self._build_headers() + url = self.get_detail_url(end_point, record_id, api_version=self._api_version) + time0 = time.time() + response = requests.get(url, headers=header) + time1 = time.time() + if self._verbose: + 
sys.stdout.write(f'Get to {end_point} request time for {url}: {time1-time0}\n') + return response + + def create(self, end_point, data): + self._check_end_point_exists(end_point) + self._check_data_consistency(data, self.allowed_fields[end_point]) + header = self._build_headers() + url = self.get_list_url(end_point) + return requests.post(url, json=data, headers=header) + + def update(self, end_point, record_id, data): + self._check_end_point_exists(end_point) + self._check_data_consistency(data, self.allowed_fields[end_point], + update=True) + header = self._build_headers() + url = self.get_detail_url(end_point, record_id=record_id) + return requests.put(url, json=data, headers=header) + + def delete(self, end_point, record_id): + self._check_end_point_exists(end_point) + header = self._build_headers() + url = self.get_detail_url(end_point, record_id) + return requests.delete(url, headers=header) + + def find_by_name(self, end_point, name): + self._check_end_point_exists(end_point) + header = self._build_headers() + url = self.get_find_by_name_url(end_point) + response = requests.get(url, headers=header, params={'name': name}) + return response + + def get_schemas(self): + if self.schemas is None: + headers = self._build_headers() + url = self.server_url + '/schemas' + response = requests.get(url, headers=headers) + if response.status_code == 200: + self.schemas = response.json() + else: + raise ValueError(f"{response.status_code}: {response.text}") + if self.allowed_fields is None: + self.allowed_fields = self._process_schema(self.schemas) + return self.schemas + + @staticmethod + def _process_schema(schemas): + schema = schemas[0] + allowed_fields = {} + for endpoint_schema in schema['TableViews']: + endpoint_name = endpoint_schema['TableViewName'] + endpoint_values = endpoint_schema['ResultFields'] + fields = {field['title']: field for field in endpoint_values} + allowed_fields[endpoint_name] = fields + return allowed_fields + + def 
_check_end_point_exists(self, endpoint): + if endpoint not in self.allowed_fields.keys(): + raise ValueError(f'{endpoint} not a recognised endpoint') + + def _check_data_consistency(self, data, allowed_fields, update=False): + update_mandatory = set(['RecordDetails', 'RecordName', 'RecordId']) + if update and not update_mandatory.issubset(data.keys()): + msg = 'Updating data keys must be RecordDetails, RecordName and RecordId' + raise ValidationError(msg) + + if not update and set(data.keys()).difference(['RecordDetails', 'RecordName', 'Acronym']): + msg = 'data keys must be RecordDetails and RecordName or Acronym' + raise ValidationError(msg) + for field_name, field_value in data['RecordDetails'].items(): + if field_name not in allowed_fields: + raise ValidationError(f'{field_name} not in allowed fields') + + field_schema = allowed_fields[field_name] + self._check_field_schema(field_name, field_schema, field_value) + + @staticmethod + def _check_field_schema(field_name, field_schema, field_value): + if field_schema['FieldType'] != field_value['FieldType']: + msg = f"Bad FieldType ({field_value['FieldType']}) for {field_name}. " + msg += f"It should be {field_schema['FieldType']}" + raise ValidationError(msg) + + states = field_schema['states'] if 'states' in field_schema else None + if states: + states = [re.sub(r" *\(.*\)", "", s) for s in states] + + subfields = field_schema['subfields'] if 'subfields' in field_schema else None + if subfields is not None and states is not None: + subfield_names = [subfield['SubFieldName'] + for subfield in subfields if subfield['IsUsed']] + + for val in field_value['Value']: + if val['Name'] not in subfield_names: + msg = f"{field_name}: {val['Name']} not in {subfield_names}" + raise ValidationError(msg) + + if val['Value'] not in states: + + msg = f"{field_value['Value']} not a valid value for " + msg += f"{field_name}, Allowed values: {'. 
'.join(states)}" + raise ValidationError(msg) + + elif states is not None: + if field_value['Value'] not in states: + msg = f"{field_value['Value']} not a valid value for " + msg += f"{field_name}, Allowed values: {'. '.join(states)}" + raise ValidationError(msg) + + def rollback(self, created_ids): + for endpoint, id_ in created_ids: + try: + self.delete(end_point=endpoint, record_id=id_) + except Exception: + pass diff --git a/mirri/biolomics/serializers/__init__.py b/mirri/biolomics/serializers/__init__.py new file mode 100644 index 0000000..b51f976 --- /dev/null +++ b/mirri/biolomics/serializers/__init__.py @@ -0,0 +1,3 @@ +RECORD_ID = 'RecordId' +RECORD_NAME = 'RecordName' +RECORD_DETAILS = 'RecordDetails' diff --git a/mirri/biolomics/serializers/bibliography.py b/mirri/biolomics/serializers/bibliography.py new file mode 100644 index 0000000..2c36406 --- /dev/null +++ b/mirri/biolomics/serializers/bibliography.py @@ -0,0 +1,82 @@ +from typing import List + +from mirri import rgetattr +from mirri.entities.publication import Publication +from mirri.biolomics.settings import PUB_MIRRI_FIELDS + +RECORD_ID = 'RecordId' +RECORD_NAME = 'RecordName' + +PUB_MAPPING = { + # 'record_id': 'RecordId', + # 'record_name': 'RecordName', + 'strains': "Associated strains", + 'taxa': "Associated taxa", + 'authors': "Authors", + # 'sequneces': "Associated sequences", + # 'abstract': "Abstract", + # 'collection': "Collection", + 'doi': "DOI number", + 'editor': "Editor(s)", + # 'full_reference': "Full reference", + # 'link': "Hyperlink", + 'isbn': "ISBN", + 'issn': "ISSN", + 'issue': "Issue", + 'journal': "Journal", + 'journal_book': "Journal-Book", + # 'keywords': "Keywords", + 'first_page': "Page from", + 'last_page': "Page to", + 'publisher': "Publisher", + 'pubmed_id': "PubMed ID", + 'volume': "Volume", + 'year': "Year", +} +REV_PUB_MAPPING = {v: k for k, v in PUB_MAPPING.items()} + + +def serializer_from_biolomics(ws_data, client=None) -> Publication: + pub = Publication() + 
def get_publication_record_name(publication):
    """Pick the record name to use for *publication*.

    The first non-empty candidate wins, in this order: ``record_name``,
    ``title``, ``PUBMED:<pubmed_id>``, ``DOI:<doi>``. Returns ``None``
    when none of them is set.
    """
    name = publication.record_name or publication.title
    if name:
        return name
    pubmed_id = publication.pubmed_id
    if pubmed_id:
        return f'PUBMED:{pubmed_id}'
    doi = publication.doi
    return f'DOI:{doi}' if doi else None
# this is a proof of concept
def serialize_location(location: "Location"):
    """Build a web-service record dict from *location*.

    The record has ``RecordName`` set to the country and a
    ``RecordDetails`` dict with:
      * ``Country`` when the country is set;
      * ``GIS position`` when both latitude and longitude are present
        (``Precision`` is added when ``coord_uncertainty`` is truthy);
      * ``Strains``, always, as an ``RLink`` placeholder with empty
        name and record id.
    """
    fields = {}
    if location.country:
        fields['Country'] = {'Value': location.country, 'FieldType': 'E'}

    # Compare against None: 0.0 is a valid latitude (equator) and a valid
    # longitude (prime meridian) and must not be dropped.
    if location.latitude is not None and location.longitude is not None:
        value = {'Latitude': location.latitude,
                 'Longitude': location.longitude}
        if location.coord_uncertainty:
            value['Precision'] = location.coord_uncertainty
        fields['GIS position'] = {'FieldType': 'L', 'Value': value}

    fields['Strains'] = {"FieldType": "RLink", 'Value': [{
        'Name': {'Value': None, 'FieldType': "E"},
        'RecordId': None
    }]}

    return {"RecordDetails": fields,
            "RecordName": location.country}
MAPPING_WS_SPEC_TYPES = {
    'Beta tubulin': 'TUBB'
}


def serialize_from_biolomics(ws_data, client=None) -> GenomicSequenceBiolomics:
    """Build a ``GenomicSequenceBiolomics`` from a web-service record.

    Copies the record id and record name, then reads three entries of
    ``RecordDetails``: ``INSDC number`` (marker id), ``Marker name``
    (marker type, translated through ``MAPPING_WS_SPEC_TYPES`` when the
    name is listed there) and ``DNA sequence`` (marker sequence).
    Entries with an empty value are ignored. *client* is not used.
    """
    marker = GenomicSequenceBiolomics()
    marker.record_id = ws_data[RECORD_ID]
    marker.record_name = ws_data[RECORD_NAME]

    for key, value in ws_data['RecordDetails'].items():
        value = value['Value']
        if not value:
            # Covers None as well: the sequence branch used to evaluate
            # "'Sequence' in None" and fail with TypeError.
            continue
        if key == 'INSDC number':
            marker.marker_id = value
        elif key == 'Marker name':
            marker.marker_type = MAPPING_WS_SPEC_TYPES.get(value) or value
        elif key == 'DNA sequence':
            if 'Sequence' in value and value['Sequence']:
                marker.marker_seq = value['Sequence']

    return marker
COMMERCIAL_USE_WITH_AGREEMENT, +) +from mirri.biolomics.settings import MIRRI_FIELDS +from mirri.utils import get_pycountry + +NAGOYA_TRANSLATOR = { + NAGOYA_NO_RESTRICTIONS: "no known restrictions under the Nagoya protocol", + NAGOYA_DOCS_AVAILABLE: "documents providing proof of legal access and terms of use available at the collection", + NAGOYA_PROBABLY_SCOPE: "strain probably in scope, please contact the culture collection", +} +REV_NAGOYA_TRANSLATOR = {v: k for k, v in NAGOYA_TRANSLATOR.items()} + +RESTRICTION_USE_TRANSLATOR = { + NO_RESTRICTION: "no restriction apply", + ONLY_RESEARCH: "for research use only", + COMMERCIAL_USE_WITH_AGREEMENT: "for commercial development a special agreement is requested", +} + +REV_RESTRICTION_USE_TRANSLATOR = {v: k for k, + v in RESTRICTION_USE_TRANSLATOR.items()} + +DATE_TYPE_FIELDS = ("Date of collection", "Date of isolation", + "Date of deposit", "Date of inclusion in the catalogue") +BOOLEAN_TYPE_FIELDS = ("Strain from a registered collection", "Dual use", + "Quarantine in Europe", "Interspecific hybrid") # , 'GMO') +FILE_TYPE_FIELDS = ("MTA file", "ABS related files") +MAX_MIN_TYPE_FIELDS = ("Tested temperature growth range", + "Recommended growth temperature") +LIST_TYPES_TO_JOIN = ('Other denomination', 'Plasmids collections fields', 'Plasmids') + +MARKER_TYPE_MAPPING = { + '16S rRNA': 'Sequences 16s', # or Sequences c16S rRNA + 'ACT': 'Sequences ACT', + 'CaM': 'Sequences CaM', + 'EF-1α': 'Sequences TEF1a', + 'ITS': 'Sequences ITS', + 'LSU': 'Sequences LSU', + 'RPB1': 'Sequences RPB1', + 'RPB2': 'Sequences RPB2', + 'TUBB': 'Sequences TUB' # or Sequences Beta tubulin +} + + +def serialize_to_biolomics(strain: StrainMirri, client=None, update=False, + log_fhand=None): # sourcery no-metrics + if log_fhand is None: + log_fhand = sys.stdout + strain_record_details = {} + + for field in MIRRI_FIELDS: + try: + biolomics_field = field["biolomics"]["field"] + biolomics_type = field["biolomics"]["type"] + except KeyError: + # 
print(f'biolomics not configured: {field["label"]}') + continue + + label = field["label"] + attribute = field["attribute"] + value = rgetattr(strain, attribute, None) + if value is None: + continue + + if label == "Accession number": + value = f"{strain.id.collection} {strain.id.number}" + if label == "Restrictions on use": + value = RESTRICTION_USE_TRANSLATOR[value] + elif label == "Nagoya protocol restrictions and compliance conditions": + value = NAGOYA_TRANSLATOR[value] + elif label in FILE_TYPE_FIELDS: + value = [{"Name": "link", "Value": fname} for fname in value] + elif label == "Other culture collection numbers": + value = "; ".join(on.strain_id for on in value) if value else None + elif label in BOOLEAN_TYPE_FIELDS: + value = 'yes' if value else 'no' + elif label in 'GMO': + value = 'Yes' if value else 'No' + elif label == "Organism type": + org_types = [ot.name for ot in value] + value = [] + for ot in ORG_TYPES.keys(): + is_organism = "yes" if ot in org_types else "no" + value.append({"Name": ot, "Value": is_organism}) + elif label == 'Taxon name': + if client: + taxa = strain.taxonomy.long_name.split(';') + value = [] + for taxon_name in taxa: + taxon = get_remote_rlink(client, TAXONOMY_WS, + taxon_name) + if taxon: + value.append(taxon) + if not value: + msg = f'WARNING: {strain.taxonomy.long_name} not found in database' + log_fhand.write(msg + '\n') + # TODO: decide to raise or not if taxon not in MIRRI DB + #raise ValueError(msg) + + elif label in DATE_TYPE_FIELDS: + year = value._year + month = value._month or 1 + day = value._day or 1 + if year is None: + continue + value = f"{year}-{month:02}-{day:02}" + elif label == 'History of deposit': + value = " < ".join(value) + elif label in MAX_MIN_TYPE_FIELDS: + if isinstance(value, (int, float, str)): + _max, _min = float(value), float(value) + else: + _max, _min = float(value['max']), float(value['min']) + + content = {"MaxValue": _max, "MinValue": _min, + "FieldType": biolomics_type} + 
strain_record_details[biolomics_field] = content + continue + elif label in LIST_TYPES_TO_JOIN: + value = '; '.join(value) + # TODO: Check how to deal with crossrefs + elif label == "Recommended medium for growth": + if client is not None: + ref_value = [] + for medium in value: + ws_gm = client.retrieve_by_name(GROWTH_MEDIUM_WS, medium) + if ws_gm is None: + raise ValueError( + f'Can not find the growth medium: {medium}') + gm = {"Name": {"Value": medium, "FieldType": "E"}, + "RecordId": ws_gm.record_id} + ref_value.append(gm) + value = ref_value + else: + continue + + elif label == "Form of supply": + _value = [] + for form in ALLOWED_FORMS_OF_SUPPLY: + is_form = "yes" if form in value else "no" + _value.append({"Name": form, "Value": is_form}) + value = _value + # print(label, value), biolomics_field + elif label == "Coordinates of geographic origin": + value = {'Latitude': strain.collect.location.latitude, + 'Longitude': strain.collect.location.longitude} + precision = strain.collect.location.coord_uncertainty + if precision is not None: + value['Precision'] = precision + elif label == "Geographic origin": + if client is not None and value.country is not None: + country = get_pycountry(value.country) + if country is None: + log_fhand.write(f'WARNING: {value.country} Not a valida country code/name\n') + else: + _value = get_country_record(country, client) + if _value is None: # TODO: Remove this once the countries are added to the DB + msg = f'WARNING: {value.country} not in MIRRI DB' + log_fhand.write(msg + '\n') + #raise ValueError(msg) + else: + content = {"Value": [_value], "FieldType": "RLink"} + strain_record_details['Country'] = content + _value = [] + for sector in ('state', 'municipality', 'site'): + sector_val = getattr(value, sector, None) + if sector_val: + _value.append(sector_val) + value = "; ".join(_value) if _value else None + if value is None: + continue + + elif label == "Ontobiotope": + if client and value: + onto = get_remote_rlink(client, 
def add_markers_to_strain_details(client, strain: StrainMirri, details):
    """Add the strain's genetic markers to *details* as ``NLink`` fields.

    Each marker of ``strain.genetics.markers`` is looked up by its
    ``marker_id`` in the sequence endpoint. Markers the web service does
    not know are reported on stdout and skipped. The others are linked
    under the field that ``MARKER_TYPE_MAPPING`` gives for their type;
    the stored sequence is attached when the remote record has one.

    *details* is modified in place.

    Raises:
        KeyError: when a marker type is not in ``MARKER_TYPE_MAPPING``.
    """
    for marker in strain.genetics.markers:
        marker_in_ws = client.retrieve_by_name(SEQUENCE_WS, marker.marker_id)
        if marker_in_ws is None:
            print('Marker not in web service')
            continue

        link = {
            "Name": {"Value": marker_in_ws.record_name,
                     "FieldType": "E"},
            "RecordId": marker_in_ws.record_id
        }
        if marker_in_ws.marker_seq:
            link["TargetFieldValue"] = {
                "Value": {"Sequence": marker_in_ws.marker_seq},
                "FieldType": "N"
            }

        # Several markers may share one type: append to the field's link
        # list instead of replacing it, so earlier markers are not lost.
        field_name = MARKER_TYPE_MAPPING[marker.marker_type]
        field = details.setdefault(field_name,
                                   {"Value": [], "FieldType": "NLink"})
        field["Value"].append(link)
AttributeError: + record_name = entity["RecordName"] + record_id = entity["RecordId"] + return {"Name": {"Value": record_name, "FieldType": "E"}, + "RecordId": record_id} + + +def add_strain_rlink_to_entity(record, strain_id, strain_name): + field_strain = { + "FieldType": "RLink", + 'Value': [{ + 'Name': {'Value': strain_name, 'FieldType': "E"}, + 'RecordId': strain_id + }] + } + record['RecordDetails']['Strains'] = field_strain + return record + + +PLOIDY_TRANSLATOR = { + 0: 'Aneuploid', + 1: 'Haploid', + 2: 'Diploid', + 3: 'Triploid', + 4: 'Tetraploid', + 9: 'Polyploid' +} + +REV_PLOIDY_TRANSLATOR = {v: k for k, v in PLOIDY_TRANSLATOR.items()} + + +def _translate_polidy(ploidy): + # print('ploidy in serializer', ploidy) + try: + ploidy = int(ploidy) + except TypeError: + return '?' + try: + ploidy = PLOIDY_TRANSLATOR[ploidy] + except KeyError: + ploidy = 'Polyploid' + return ploidy + + +def serialize_from_biolomics(biolomics_strain, client=None): # sourcery no-metrics + strain = StrainMirri() + strain.record_id = biolomics_strain.get('RecordId', None) + strain.record_name = biolomics_strain.get('RecordName', None) + for field in MIRRI_FIELDS: + try: + biolomics_field = field["biolomics"]["field"] + except KeyError: + # print(f'biolomics not configured: {field["label"]}') + continue + + label = field["label"] + attribute = field["attribute"] + field_data = biolomics_strain['RecordDetails'].get(biolomics_field, None) + if field_data is None: + continue + is_empty = field_data.get('IsEmpty') + if is_empty: + continue + if biolomics_field in ('Tested temperature growth range', 'Recommended growth temperature'): + value = {'max': field_data.get('MaxValue', None), + 'min': field_data.get('MinValue', None)} + else: + value = field_data['Value'] + # if value in (None, '', [], {}, '?', 'Unknown', 'nan', 'NaN'): + # continue + + # print(label, attribute, biolomics_field, value) + + if label == 'Accession number': + number = strain.record_name + mirri_id = 
StrainId(number=number) + strain.synonyms = [mirri_id] + coll, num = value.split(' ', 1) + accession_number_id = StrainId(collection=coll, number=num) + strain.id = accession_number_id + continue + elif label == "Restrictions on use": + value = REV_RESTRICTION_USE_TRANSLATOR[value] + elif label == 'Nagoya protocol restrictions and compliance conditions': + value = REV_NAGOYA_TRANSLATOR[value] + elif label in FILE_TYPE_FIELDS: + value = [f['Value'] for f in value] + elif label == "Other culture collection numbers": + other_numbers = [] + for on in value.split(";"): + on = on.strip() + try: + collection, number = on.split(" ", 1) + except ValueError: + collection = None + number = on + _id = StrainId(collection=collection, number=number) + other_numbers.append(_id) + value = other_numbers + elif label in BOOLEAN_TYPE_FIELDS: + value = value == 'yes' + elif label == 'GMO': + value = value == 'Yes' + elif label == "Organism type": + organism_types = [OrganismType(item['Name']) for item in value if item['Value'] == 'yes'] + if organism_types: + value = organism_types + elif label in 'Taxon name': + value = ";".join([v['Name']['Value'] for v in value]) + add_taxon_to_strain(strain, value) + continue + + elif label in DATE_TYPE_FIELDS: + # date_range = DateRange() + value = DateRange().strpdate(value) + + elif label in ("Recommended growth temperature", + "Tested temperature growth range"): + if (value['max'] is None or value['max'] == 0 or + value['min'] is None and value['min'] == 0): + continue + elif label == "Recommended medium for growth": + value = [v['Name']['Value'] for v in value] + elif label == "Form of supply": + value = [item['Name'] for item in value if item['Value'] == 'yes'] + elif label in LIST_TYPES_TO_JOIN: + value = [v.strip() for v in value.split(";")] + elif label == "Coordinates of geographic origin": + if ('Longitude' in value and 'Latitude' in value and + isinstance(value['Longitude'], float) and + isinstance(value['Latitude'], float)): + 
strain.collect.location.longitude = value['Longitude'] + strain.collect.location.latitude = value['Latitude'] + if value['Precision'] != 0: + strain.collect.location.coord_uncertainty = value['Precision'] + continue + elif label == "Altitude of geographic origin": + value = float(value) + elif label == "Geographic origin": + strain.collect.location.site = value + continue + elif label == 'Ontobiotope': + try: + value = re.search("(OBT:[0-9]{5,7})", value[0]['Name']['Value']).group() + except (KeyError, IndexError, AttributeError): + continue + + elif label == 'Ploidy': + value = REV_PLOIDY_TRANSLATOR[value] + elif label == 'Literature': + if client is not None: + pubs = [] + for pub in value: + pub = client.retrieve_by_id(BIBLIOGRAPHY_WS, pub['RecordId']) + pubs.append(pub) + value = pubs + + + rsetattr(strain, attribute, value) + # fields that are not in MIRRI FIELD list + # country + if 'Country' in biolomics_strain['RecordDetails'] and biolomics_strain['RecordDetails']['Country']: + try: + country_name = biolomics_strain['RecordDetails']['Country']['Value'][0]['Name']['Value'] + country = get_pycountry(country_name) + country_3 = country.alpha_3 if country else None + except (IndexError, KeyError): + country_3 = None + if country_3: + strain.collect.location.country = country_3 + # Markers: + if client: + markers = [] + for marker_type, biolomics_marker in MARKER_TYPE_MAPPING.items(): + try: + marker_value = biolomics_strain['RecordDetails'][biolomics_marker]['Value'] + except KeyError: + continue + if not marker_value: + continue + + for marker in marker_value: + record_id = marker['RecordId'] + marker = client.retrieve_by_id(SEQUENCE_WS, record_id) + if marker is not None: + markers.append(marker) + if markers: + strain.genetics.markers = markers + + return strain + + +def get_country_record(country, client): + for attr in ('common_name', 'name', 'official_name'): + val = getattr(country, attr, None) + if val is not None: + _value = get_remote_rlink(client, 
# TODO this is all wrong, needs deep revision

class TaxonomyMirri(Taxonomy):
    """Attribute bag restricted to the names listed in ``fields``.

    Values live in the ``_data`` dict. Reading an allowed attribute that
    was never set gives ``None``; reading or writing any other attribute
    raises ``TypeError``.

    NOTE(review): ``fields`` holds growth-medium names, apparently copied
    from the growth-medium entity; confirm the intended taxonomy fields.
    """

    fields = ['record_id', 'record_name', 'acronym', 'full_description',
              'ingredients', 'description', 'other_name', 'ph',
              'sterilization_conditions']

    # The class used to define __init__ twice; only this second definition
    # was ever in effect, so the shadowed one (which called
    # super().__init__(freeze=False, **kwargs)) has been dropped.
    def __init__(self, **kwargs):
        """Store every allowed keyword argument whose value is not None."""
        self._data = {}
        for field in self.fields:
            # Look the value up by the loop variable; the literal key
            # 'field' raised KeyError for any recognised keyword.
            value = kwargs.get(field)
            if value is not None:
                setattr(self, field, value)

    def __setattr__(self, attr, value):
        if attr == '_data':
            super().__setattr__(attr, value)
            return
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        self._data[attr] = value

    def __getattr__(self, attr):
        # Only reached when normal lookup fails. For '_data' that means the
        # storage dict does not exist yet: signal it with AttributeError
        # rather than returning the builtin ``super``.
        if attr == '_data':
            raise AttributeError(attr)
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        return self._data.get(attr, None)

    def dict(self):
        """Return the underlying storage dict (not a copy)."""
        return self._data
return medium diff --git a/mirri/biolomics/settings.py b/mirri/biolomics/settings.py new file mode 100644 index 0000000..85be1df --- /dev/null +++ b/mirri/biolomics/settings.py @@ -0,0 +1,373 @@ +try: + from mirri.biolomics.secrets import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +except ImportError: + raise ImportError( + 'You need a secrets.py in the project dir. with CLIENT_ID, SECRET_ID, USERNAME, PASSWORD') + +MIRRI_FIELDS = [ + { + "attribute": "id", + "label": "Accession number", + "mandatory": True, + "biolomics": {"field": "Collection accession number", "type": "E"}, + }, + { + "attribute": "restriction_on_use", + "label": "Restrictions on use", + "mandatory": True, + "biolomics": {"field": "Restrictions on use", "type": "T"}, + }, + { + "attribute": "nagoya_protocol", + "label": "Nagoya protocol restrictions and compliance conditions", + "mandatory": True, + "biolomics": {"field": "Nagoya protocol restrictions and compliance conditions", "type": "T"}, + }, + { + "attribute": "abs_related_files", + "label": "ABS related files", + "mandatory": False, + "biolomics": {"field": "ABS related files", "type": "U"}, + }, + { + "attribute": "mta_files", + "label": "MTA file", + "mandatory": False, + "biolomics": {"field": "MTA files URL", "type": "U"}, + }, + { + "attribute": "other_numbers", + "label": "Other culture collection numbers", + "mandatory": False, + "biolomics": {"field": "Other culture collection numbers", "type": "E"}, + }, + { + "attribute": "is_from_registered_collection", + "label": "Strain from a registered collection", + "mandatory": False, + "biolomics": {"field": "Strain from a registered collection", "type": "T"}, + }, + { + "attribute": "risk_group", + "label": "Risk Group", + "mandatory": True, + "biolomics": {"field": "Risk group", "type": "T"}, + }, + { + "attribute": "is_potentially_harmful", + "label": "Dual use", + "mandatory": False, + "biolomics": {"field": "Dual use", "type": "T"}, + }, + { + "attribute": "is_subject_to_quarantine", 
+ "label": "Quarantine in Europe", + "mandatory": False, + "biolomics": {"field": "Quarantine in Europe", "type": "T"}, + }, + { + "attribute": "taxonomy.organism_type", + "label": "Organism type", + "mandatory": True, + "biolomics": {"field": "Organism type", "type": "C"}, + }, + { + "attribute": "taxonomy.long_name", + "label": "Taxon name", + "mandatory": True, + "biolomics": {"field": "Taxon name", "type": "SynLink"}, + }, + { + "attribute": "taxonomy.infrasubspecific_name", + "label": "Infrasubspecific names", + "mandatory": False, + "biolomics": {"field": "Infrasubspecific names", "type": "E"}, + }, + { + "attribute": "taxonomy.comments", + "label": "Comment on taxonomy", + "mandatory": False, + "biolomics": {"field": "Comment on taxonomy", "type": "E"}, + }, + { + "attribute": "taxonomy.interspecific_hybrid", + "label": "Interspecific hybrid", + "mandatory": False, + "biolomics": {"field": "Interspecific hybrid", "type": "T"}, + }, + { + "attribute": "status", "label": "Status", "mandatory": False, + "biolomics": {"field": "Status", "type": "E"}, + }, + { + "attribute": "history", + "label": "History of deposit", + "mandatory": False, + "biolomics": {"field": "History", "type": "E"}, + }, + { + "attribute": "deposit.who", + "label": "Depositor", + "mandatory": False, + "biolomics": {"field": "Depositor", "type": "E"}, + }, + { + "attribute": "deposit.date", + "label": "Date of deposit", + "mandatory": False, + "biolomics": {"field": "Deposit date", "type": "H"}, + }, + { + "attribute": "catalog_inclusion_date", + "label": "Date of inclusion in the catalogue", + "mandatory": False, + "biolomics": {"field": "Date of inclusion in the catalogue", "type": "H"}, + }, + { + "attribute": "collect.who", + "label": "Collected by", + "mandatory": False, + "biolomics": {"field": "Collector", "type": "E"}, + }, + { + "attribute": "collect.date", + "label": "Date of collection", + "mandatory": False, + "biolomics": {"field": "Collection date", "type": "H"}, + }, + { + 
"attribute": "isolation.who", + "label": "Isolated by", + "mandatory": False, + "biolomics": {"field": "Isolator", "type": "E"}, + }, + { + "attribute": "isolation.date", + "label": "Date of isolation", + "mandatory": False, + "biolomics": {"field": "Isolation date", "type": "H"}, + }, + { + "attribute": "isolation.substrate_host_of_isolation", + "label": "Substrate/host of isolation", + "mandatory": False, + "biolomics": {"field": "Substrate of isolation", "type": "E"}, + }, + { + "attribute": "growth.tested_temp_range", + "label": "Tested temperature growth range", + "mandatory": False, + "biolomics": {"field": "Tested temperature growth range", "type": "S"}, + }, + { + "attribute": "growth.recommended_temp", + "label": "Recommended growth temperature", + "mandatory": True, + "biolomics": {"field": "Recommended growth temperature", "type": "S"}, + }, + { + "attribute": "growth.recommended_media", + "label": "Recommended medium for growth", + "mandatory": True, + "biolomics": {"field": "Recommended growth medium", "type": "RLink"}, + }, + { + "attribute": "form_of_supply", + "label": "Form of supply", + "mandatory": True, + "biolomics": {"field": "Form", "type": "C"}, + }, + { + "attribute": "other_denominations", + "label": "Other denomination", + "mandatory": False, + "biolomics": {"field": "Other denomination", "type": "E"}, + }, + { + # here we use latitude to check if there is data in some of the fields + "attribute": "collect.location.latitude", + "label": "Coordinates of geographic origin", + "mandatory": False, + "biolomics": {"field": "Coordinates of geographic origin", "type": "L"}, + }, + { + "attribute": "collect.location.altitude", + "label": "Altitude of geographic origin", + "mandatory": False, + "biolomics": {"field": "Altitude of geographic origin", "type": "D"}, + }, + { + "attribute": "collect.location", + "label": "Geographic origin", + "mandatory": True, + "biolomics": {"field": "Geographic origin", "type": "E"}, + }, + { + "attribute": 
"collect.habitat", + "label": "Isolation habitat", + "mandatory": False, + "biolomics": {"field": "Isolation habitat", "type": "E"}, + }, + # { + # "attribute": "collect.habitat_ontobiotope", + # "label": "Ontobiotope term for the isolation habitat", + # "mandatory": False, + # "biolomics": {"field": "Ontobiotope term for the isolation habitat", "type": "E"}, + # }, + { + "attribute": "collect.habitat_ontobiotope", + "label": "Ontobiotope", + "mandatory": False, + "biolomics": {"field": "Ontobiotope", "type": "RLink"}, + }, + { + "attribute": "genetics.gmo", "label": "GMO", "mandatory": False, + "biolomics": {"field": "GMO", "type": "V"}, + }, + { + "attribute": "genetics.gmo_construction", + "label": "GMO construction information", + "mandatory": False, + "biolomics": {"field": "GMO construction information", "type": "E"}, + }, + { + "attribute": "genetics.mutant_info", + "label": "Mutant information", + "mandatory": False, + "biolomics": {"field": "Mutant information", "type": "E"}, + }, + { + "attribute": "genetics.genotype", + "label": "Genotype", + "mandatory": False, + "biolomics": {"field": "Genotype", "type": "E"}, + }, + { + "attribute": "genetics.sexual_state", + "label": "Sexual state", + "mandatory": False, + "biolomics": {"field": "Sexual state", "type": "E"}, + }, + { + "attribute": "genetics.ploidy", + "label": "Ploidy", + "mandatory": False, + "biolomics": {"field": "Ploidy", "type": "T"}, + }, + { + "attribute": "genetics.plasmids", + "label": "Plasmids", + "mandatory": False, + "biolomics": {"field": "Plasmids", "type": "E"}, + }, + { + "attribute": "genetics.plasmids_in_collections", + "label": "Plasmids collections fields", + "mandatory": False, + "biolomics": {"field": "Plasmids collections fields", "type": "E"}, + }, + { + "attribute": "publications", + "label": "Literature", + "mandatory": False, + "biolomics": {"field": "Literature", "type": "RLink"}, + }, + { + "attribute": "pathogenicity", + "label": "Pathogenicity", + "mandatory": False, 
+ "biolomics": {"field": "Pathogenicity", "type": "E"}, + }, + { + "attribute": "enzyme_production", + "label": "Enzyme production", + "mandatory": False, + "biolomics": {"field": "Enzyme production", "type": "E"}, + }, + { + "attribute": "production_of_metabolites", + "label": "Production of metabolites", + "mandatory": False, + "biolomics": {"field": "Metabolites production", "type": "E"}, + }, + { + "attribute": "applications", + "label": "Applications", + "mandatory": False, + "biolomics": {"field": "Applications", "type": "E"}, + }, + { + "attribute": "remarks", "label": "Remarks", "mandatory": False, + "biolomics": {"field": "Remarks", "type": "E"}, + }, + { + "attribute": "literature_linked_to_the_sequence_genome", + "label": "Literature linked to the sequence/genome", + "mandatory": False, + # "biolomics": {"field": "MTA files URL", "type": "U"}, + }, +] + + +PUB_MIRRI_FIELDS = [ + { + "attribute": "pub_id", "mandatory": False, + "biolomics": {"field": "", "type": "E"}, + }, + { + "attribute": "pubmed_id", "mandatory": False, + "biolomics": {"field": "PubMed ID", "type": "E"}, + }, + { + "attribute": "doi", "mandatory": False, + "biolomics": {"field": "DOI number", "type": "E"}, + }, + { + "attribute": "title", "mandatory": False, + "biolomics": {"field": "Title", "type": "E"}, + }, + { + "attribute": "authors", "mandatory": False, + "biolomics": {"field": "Authors", "type": "E"}, + }, + { + "attribute": "journal", "mandatory": False, + "biolomics": {"field": "Journal", "type": "E"}, + }, + { + "attribute": "volumen", "mandatory": False, + "biolomics": {"field": "Volume", "type": "E"}, + }, + { + "attribute": "issue", "mandatory": False, + "biolomics": {"field": "Issue", "type": "E"}, + }, + { + "attribute": "first_page", "mandatory": False, + "biolomics": {"field": "Page from", "type": "E"}, + }, + { + "attribute": "last_page", "mandatory": False, + "biolomics": {"field": "Page to", "type": "E"}, + }, + { + "attribute": "last_page", "label": "", 
"mandatory": False, + "biolomics": {"field": "", "type": "E"}, + }, + { + "attribute": "last_page", "label": "", "mandatory": False, + "biolomics": {"field": "", "type": "E"}, + }, + { + "attribute": "book_title", "label": "", "mandatory": False, + "biolomics": {"field": "Book title", "type": "E"}, + }, + { + "attribute": "publisher", "label": "", "mandatory": False, + "biolomics": {"field": "Publisher", "type": "E"}, + }, + { + "attribute": "editor", "label": "", "mandatory": False, + "biolomics": {"field": "Editor(s)", "type": "E"}, + }, +] diff --git a/mirri/data/ontobiotopes.csv b/mirri/data/ontobiotopes.csv new file mode 100644 index 0000000..934f7b6 --- /dev/null +++ b/mirri/data/ontobiotopes.csv @@ -0,0 +1,3603 @@ +ID Name +OBT:000977 abalone +OBT:000175 abdomen +OBT:003341 ABF pig +OBT:001394 abomasum +OBT:000475 abrasion +OBT:000476 abscess +OBT:003590 acetamide enriched soil +OBT:001388 Achillea millefolium +OBT:000176 acid environment +OBT:003034 acid hot spring +OBT:002880 acid mine drainage +OBT:002952 acid mine water +OBT:003178 acidified beer wort +OBT:002881 acido resistant +OBT:002882 acido sensitive +OBT:002883 acido tolerant +OBT:000177 acidophile +OBT:002884 activated carbon biofilter +OBT:002885 activated carbon fiber felt +OBT:002757 activated carbon filter +OBT:001856 activated sludge +OBT:000477 additive +OBT:000029 adherent +OBT:000178 adipocyte +OBT:003179 adolescent +OBT:001395 adult animal +OBT:003245 adult human +OBT:003180 adult tsetse fly +OBT:000179 aerobe +OBT:000180 aerobic bioreactor +OBT:000181 aerobic environment +OBT:000030 aerosol +OBT:000182 aerotactic +OBT:000183 aerotolerant +OBT:001852 Aeschynomene indica +OBT:000031 agar +OBT:000978 agranulocyte +OBT:000032 agricultural equipment +OBT:000033 agricultural input +OBT:000184 agricultural pest +OBT:000034 agricultural product +OBT:001857 agricultural soil +OBT:000035 agricultural species +OBT:000185 agricultural tool +OBT:000478 agricultural waste +OBT:003246 agricultural 
wastewater treatment plant +OBT:001396 aioli +OBT:000186 air +OBT:000979 air conditioning system +OBT:000980 air filter +OBT:000479 air treatment unit +OBT:000480 airplane +OBT:000481 airport terminal +OBT:001397 alcoholic drink +OBT:000981 ale yeast +OBT:003312 Alfalfa +OBT:000982 alfalfa silage +OBT:002758 algae +OBT:001398 algae and related product +OBT:001858 algae as food +OBT:002886 alkali resistant +OBT:002887 alkali sensitive +OBT:002888 alkali tolerant +OBT:000187 alkaline environment +OBT:003035 alkaline hotspring +OBT:001399 alkaline lake +OBT:000983 alkaline salt crust +OBT:003333 alkaline soda lake +OBT:000188 alkaliphile +OBT:001400 alligator +OBT:001389 Allium +OBT:000036 alloploid +OBT:002759 alluvial gravel aquifer +OBT:002953 almond +OBT:001401 almond and primary derivative thereof +OBT:001859 almond as food +OBT:003349 almond beverage +OBT:003373 almond milk +OBT:001402 almond tree +OBT:000482 alpha-hemolytic +OBT:000483 alpine soil +OBT:000484 alveolar epithelium +OBT:001860 amaranth +OBT:001403 amaranth and primary derivative thereof +OBT:003350 amaranth milk +OBT:003181 amasi +OBT:002954 amended soil +OBT:002319 American cheese +OBT:002760 aminoglycosides resistant +OBT:002955 ammonia oxidizing +OBT:000485 amniotic fluid +OBT:000486 amoebas +OBT:002761 amoxicillin resistant +OBT:002956 amphibian +OBT:001404 amphibian and product thereof +OBT:002889 amphipod +OBT:002762 ampicillin resistant +OBT:002763 ampicillin sensitive +OBT:000189 anaerobe +OBT:000190 anaerobic bioreactor +OBT:000487 anaerobic dechlorinating bioreactor +OBT:002890 anaerobic digester +OBT:001861 anaerobic digester sludge +OBT:000191 anaerobic environment +OBT:001405 anaerobic mud +OBT:001406 anaerobic sediment +OBT:001862 anaerobic sewage sludge +OBT:000488 anaerobic sludge blanket reactor +OBT:002957 anaerobic wastewater digester +OBT:003036 anammox +OBT:002334 anchovie meat +OBT:000192 anemotactic +OBT:000037 aneuploid +OBT:002335 anglerfish meat +OBT:000193 animal 
+OBT:000984 animal based juice +OBT:001863 animal blood +OBT:001407 animal blood and product thereof +OBT:001864 animal bone +OBT:001408 animal bone and product thereof +OBT:001409 animal bone marrow and product thereof +OBT:001865 animal brain +OBT:001410 animal brain and product thereof +OBT:002668 animal commensal +OBT:003182 animal facultative symbiont +OBT:000194 animal farm +OBT:000038 animal feed +OBT:002336 animal filarial nematode +OBT:001866 animal foot +OBT:001411 animal foot and product thereof +OBT:000003 animal habitat +OBT:001867 animal head +OBT:001412 animal head and product thereof +OBT:001868 animal heart +OBT:001413 animal heart and product thereof +OBT:000195 animal hosted +OBT:000004 animal husbandry and agricultural habitat +OBT:001869 animal kidney +OBT:001414 animal kidney and product thereof +OBT:001870 animal liver +OBT:001415 animal liver and product thereof +OBT:001416 animal manure +OBT:001417 animal marrowbone +OBT:001871 animal neck +OBT:001418 animal neck and product thereof +OBT:003183 animal obligate symbiont +OBT:002764 animal opportunist +OBT:000039 animal part +OBT:002669 animal pathogen +OBT:000489 animal product and primary derivative thereof +OBT:001872 animal roe +OBT:001419 animal roe and product thereof +OBT:001873 animal skin +OBT:001420 animal skin and product thereof +OBT:001874 animal stomach +OBT:001421 animal stomach and product thereof +OBT:003125 animal symbiont +OBT:001875 animal tail +OBT:001422 animal tail and product thereof +OBT:000196 animal tissue +OBT:001876 animal tongue +OBT:001423 animal tongue and product thereof +OBT:000985 animal waste +OBT:000490 animal with age or sex property +OBT:000491 animal with disease +OBT:000986 animal with life stage property +OBT:000987 annelid +OBT:000197 anoxic environment +OBT:002670 anoxic river sediment +OBT:001877 anoxic sewage sludge +OBT:003037 anoxic water +OBT:003081 anoxic zone of freshwater lake +OBT:000040 anoxygenic +OBT:001424 ant +OBT:000492 antacid 
suspension +OBT:002765 antagonist +OBT:001878 antelope +OBT:001425 anther +OBT:001426 anther part +OBT:000493 anthosphere +OBT:000494 anthosphere part +OBT:002671 antibacterial resistant +OBT:002672 antibacterial sensitive +OBT:002673 antibacterial tolerant +OBT:002674 antibiotic resistant +OBT:002675 antibiotic sensitive +OBT:002676 antibiotic tolerant +OBT:000041 antibiotic-containing media +OBT:002677 antifungal resistant +OBT:002678 antifungal sensitive +OBT:002679 antifungal tolerant +OBT:000042 antimicrobial activity +OBT:002337 antimicrobial resistant +OBT:002338 antimicrobial sensitive +OBT:002339 antimicrobial tolerant +OBT:002680 antiviral resistant +OBT:002681 antiviral sensitive +OBT:002682 antiviral tolerant +OBT:002683 antiyeast resistant +OBT:002684 antiyeast sensitive +OBT:002685 antiyeast tolerant +OBT:000988 anus +OBT:001427 aorta +OBT:001428 aphid +OBT:002340 aphotic zone +OBT:000043 apiary +OBT:001429 appendix +OBT:002958 apple +OBT:001879 apple and primary derivative thereof +OBT:002341 apple as food +OBT:003548 apple cider +OBT:003575 apple juice +OBT:001880 apple tree +OBT:001881 apricot and primary derivative thereof +OBT:002342 apricot as food +OBT:002891 apron +OBT:000044 aquaculture equipment +OBT:000045 aquaculture farm +OBT:000005 aquaculture habitat +OBT:000046 aquaculture pond +OBT:000198 aquarium +OBT:000047 aquatic environment +OBT:000199 aquatic eukaryotic species +OBT:002686 aquatic plant +OBT:001430 aquatic sediment +OBT:000495 aqueous humour +OBT:002687 aquifer +OBT:003484 aquifer contaminated with unleaded gasoline +OBT:001431 aquifer sediment +OBT:001390 Arabidopsis +OBT:001853 Arabidopsis thaliana +OBT:002343 arable soil +OBT:000989 arachnid +OBT:002344 arctic marine sediment +OBT:000048 area with climate property +OBT:000049 area with epidemiologic property +OBT:001432 arm +OBT:001882 armadillo +OBT:001433 aromatic product and primary derivative thereof +OBT:001434 arsenate treated wood +OBT:003126 arsenic contaminated-soil 
+OBT:002959 arsenic oxidizing +OBT:003485 arsenic-rich aquifer +OBT:003038 arsenite oxidizing +OBT:000990 artery +OBT:000496 arthropod +OBT:000200 arthropod part +OBT:000991 artic valley +OBT:002345 artichoke +OBT:001883 artichoke and related product +OBT:000006 artificial environment +OBT:000050 artificial water environment +OBT:000201 artificial water structure +OBT:001884 arugula +OBT:000992 ascidian +OBT:000051 asexual reproduction +OBT:000993 ash dump +OBT:003495 Asiago +OBT:002346 asparagus +OBT:001885 aspargus and related product +OBT:000972 Asteraceae +OBT:000497 atherosclerotic lesion +OBT:000052 atmosphere part +OBT:002347 aubergine +OBT:001886 aubergine and related product +OBT:000498 auricular prosthesis +OBT:000202 automated teller machine +OBT:000053 autoploid +OBT:000499 autotroph +OBT:000203 auxotroph +OBT:001435 avian crop +OBT:002960 avocado +OBT:001887 avocado and primary derivative thereof +OBT:002348 avocado as food +OBT:002349 baboon +OBT:003184 baby +OBT:002350 baby corn +OBT:001888 baby corn and related product +OBT:000994 baby diaper +OBT:000995 baby wipe +OBT:000204 back +OBT:000996 backswamp +OBT:000997 backwater +OBT:002351 bacon +OBT:000500 bacteriocyte +OBT:000998 bacteriome +OBT:001889 baked food +OBT:001436 bakery product +OBT:000999 baking powder +OBT:001000 baking soda +OBT:003082 baking yeast +OBT:003371 Balinese traditional fermented sausage +OBT:001001 bamboo +OBT:002352 bamboo shoot +OBT:001890 bamboo shoot and related product +OBT:002961 banana +OBT:001891 banana and primary derivative thereof +OBT:002353 banana as food +OBT:001892 banana tree +OBT:000205 bandage +OBT:000206 banknote +OBT:002354 barb meat +OBT:001002 bark +OBT:001893 barley and primary derivative thereof +OBT:003405 barley beverage +OBT:000501 barley feed +OBT:003442 barley milk +OBT:002892 barley plant +OBT:002355 barley product +OBT:000207 barotactic +OBT:002356 barracuda meat +OBT:001003 barrier flat +OBT:002357 basil +OBT:001894 basil and related product 
+OBT:001437 basophil +OBT:001895 bat +OBT:000502 bathroom +OBT:000208 bathroom equipment +OBT:000503 bathroom sink +OBT:000504 bathtub +OBT:000505 bay +OBT:001438 beach mud +OBT:001896 beach sand +OBT:000209 beak +OBT:001897 bean +OBT:001439 bean and related product +OBT:001898 bear +OBT:003465 Beaufort +OBT:002358 beaver +OBT:001440 bechamel sauce +OBT:000506 bed as furniture +OBT:000507 bed sheet +OBT:000508 bedroom +OBT:000210 bedroom equipment +OBT:003039 bedside carafe +OBT:003342 bedside water bottle +OBT:000509 bedspread +OBT:001441 bee +OBT:003040 bee pollen +OBT:001899 beef +OBT:003549 beef cattle +OBT:000510 beef farm +OBT:002359 beefsteak +OBT:003541 beer +OBT:003550 beer Shava +OBT:003127 beer wort +OBT:000511 beer yeast +OBT:002360 beer-bottling plant +OBT:002962 beeswax +OBT:003576 beet juice +OBT:002361 beetroot +OBT:001900 beetroot and related product +OBT:001004 benzene-contaminated site +OBT:002963 berry +OBT:001901 berry and primary derivative thereof +OBT:001442 berry and small fruit and primary derivative thereof +OBT:002362 berry as food +OBT:000512 beta-hemolytic +OBT:000513 big colony +OBT:000514 bile +OBT:000211 bile resistant +OBT:000515 biliary tract +OBT:001443 bio clean room +OBT:000516 biofertilizer +OBT:000054 biofilm +OBT:001444 biofilm forming +OBT:000212 biofilm in natural environment +OBT:001005 biofilm phenotype +OBT:002766 biofilter +OBT:000055 bioluminescent +OBT:000213 biomat +OBT:000056 bioreactor +OBT:000517 biotrophic +OBT:001445 birch +OBT:001446 bird +OBT:001006 bird and reptile GIT part +OBT:001447 bird meat +OBT:001448 birria +OBT:001902 biscuit +OBT:003551 bison +OBT:002363 bison meat +OBT:003247 bite +OBT:001449 black anoxic freshwater mud +OBT:002364 black anoxic marine sediment +OBT:002365 black pepper +OBT:001007 black pigmented +OBT:002366 black pudding +OBT:001450 black sediment +OBT:002767 black smoker +OBT:002768 black smoker chimney +OBT:002964 blackberry +OBT:001903 blackberry and primary derivative thereof 
+OBT:002367 blackberry as food +OBT:001904 blackcurrant and primary derivative thereof +OBT:002368 blackcurrant as food +OBT:000518 bladder +OBT:000519 bladder stone +OBT:001451 blade +OBT:001905 blanched food +OBT:001452 blanket bog peat +OBT:000520 blender +OBT:003496 Bleu +OBT:000521 blood +OBT:000522 blood meal +OBT:000523 blood plasma +OBT:001906 blood sausage +OBT:000524 blood serum +OBT:000525 blood vessel +OBT:001453 blood-feeding insect +OBT:000057 bloom forming +OBT:001008 blue pigmented +OBT:003486 blue veined cheese +OBT:003497 Blue Wensleydate +OBT:002965 blueberry +OBT:001907 blueberry and primary derivative thereof +OBT:002369 blueberry as food +OBT:003041 boar +OBT:003083 bobtail squid +OBT:000214 body +OBT:000215 body fluid +OBT:000216 body surface +OBT:001009 bog +OBT:001908 boiled food +OBT:001010 boiler +OBT:001011 bone +OBT:001012 bone caries +OBT:000526 bone fracture +OBT:002688 bone marrow +OBT:000527 bone meal +OBT:000528 bone-anchored prosthesis +OBT:002370 bonito meat +OBT:000217 boot swab +OBT:001454 borax leachate +OBT:001455 borehole +OBT:001013 botanical garden soil +OBT:001456 bottle +OBT:001909 bottled water +OBT:001910 bottling factory +OBT:001911 bouillabaisse +OBT:003542 bovine +OBT:001912 bovine milk +OBT:000058 bovine serum +OBT:003248 boy +OBT:001457 brackish pond +OBT:002371 brackish water +OBT:002689 brain +OBT:001014 brain abcess +OBT:001458 bran +OBT:001015 branch +OBT:001016 brasserie +OBT:001391 Brassica +OBT:001459 brassica vegetable +OBT:001913 bread +OBT:001914 bread pre mix +OBT:002690 bread wheat product +OBT:001460 breakfast cereal +OBT:001017 breast +OBT:001018 breast milk +OBT:000059 breeding site +OBT:002372 brewery +OBT:002893 brewery wastewater +OBT:003466 Brick cheese +OBT:003498 Brie +OBT:002769 brine +OBT:002691 brine pool +OBT:003406 brined cheese +OBT:002373 brioche +OBT:001461 brisket saw +OBT:003547 Bristish beer +OBT:003443 brocciu +OBT:002374 broccoli +OBT:001915 broccoli and related product 
+OBT:003278 brocoli +OBT:003334 broiler chicken +OBT:002692 broiler meat strip +OBT:000529 broncho-pulmonary segment +OBT:000530 bronchus +OBT:002693 brown dog tick +OBT:003249 brown mushroom +OBT:001019 brown pigmented +OBT:002658 Brugia malayi +OBT:003279 brussel sprout +OBT:001916 brussel sprout and related product +OBT:002375 brussel sprout as food +OBT:002894 bryozoan +OBT:001020 buccal +OBT:003499 Buche de chèvre +OBT:001917 buckwheat +OBT:001462 buckwheat and primary derivative thereof +OBT:001463 bud +OBT:002376 buegill sunfish meat +OBT:003552 buffalo +OBT:001918 buffalo milk +OBT:000531 buffet +OBT:001464 bug +OBT:000532 building construction and demolition waste +OBT:001465 bulb vegetable +OBT:001021 bulbous plant +OBT:003243 Bulgarian yogurt +OBT:000533 bulk soil +OBT:000534 bulk tank +OBT:003571 bull +OBT:001919 bumblebee +OBT:001920 bun +OBT:001466 burger +OBT:000535 burn +OBT:002377 burnt soil +OBT:003313 bus driver +OBT:001467 butcher's knife +OBT:001468 butter +OBT:001469 buttermilk +OBT:003280 cabbage +OBT:003444 caciocavallo +OBT:003467 Caciocavallo +OBT:001470 caecal content +OBT:001471 caecum +OBT:001023 cafè +OBT:001022 cafeteria +OBT:003084 cake +OBT:001921 cake pre mix +OBT:002378 calcareous ooze +OBT:001024 calcereous rock +OBT:003577 calf +OBT:001922 calf barn +OBT:001025 calorifier +OBT:001923 camel +OBT:001924 camel milk +OBT:003500 Camembert +OBT:001472 can +OBT:001925 canal of root filled tooth +OBT:002320 Cancoillotte +OBT:001926 candied food +OBT:000536 canker +OBT:001927 canned fish +OBT:001473 canned food +OBT:001928 canned meat +OBT:003128 canned olive +OBT:001929 canned seafood +OBT:001930 canning factory +OBT:001026 canopy humus +OBT:003468 Cantal +OBT:002966 cantaloupe +OBT:002770 cantaloupe as food +OBT:001931 cantaloupe rind +OBT:001932 caramel +OBT:002379 carbonated bottled water +OBT:000218 carboxydotroph +OBT:000537 carcass +OBT:002380 cardoon +OBT:001933 cardoon and related product +OBT:001027 cargo oil tank +OBT:000538 
caries +OBT:002381 carp meat +OBT:003085 carpaccio +OBT:000219 carpet +OBT:000539 carpet floor +OBT:000220 carpet tile +OBT:000540 carposphere +OBT:000541 carposphere part +OBT:001474 carr +OBT:003501 Carrè de l'est +OBT:000542 carrier +OBT:002382 carrot +OBT:001934 carrot and related product +OBT:003578 carrot juice +OBT:001475 cashew and primary derivative thereof +OBT:001935 cashew apple +OBT:003351 cashew beverage +OBT:003374 cashew milk +OBT:001936 cashew seed +OBT:003086 cat +OBT:000060 catalase activity +OBT:000221 catalase negative +OBT:000222 catalase positive +OBT:002967 catfish +OBT:002383 catfish meat +OBT:000543 catfish pond +OBT:000223 catheter +OBT:003532 cattle +OBT:001476 cattle dipping +OBT:003129 cattle waste +OBT:000224 cattle-dipping vat +OBT:001477 cattle-farm compost +OBT:003487 cauliflower +OBT:001937 cauliflower and related product +OBT:002384 cauliflower as food +OBT:000544 caulosphere +OBT:000545 caulosphere part +OBT:000225 cave +OBT:003375 caviar +OBT:000226 ceiling tile +OBT:002385 celeriac +OBT:001938 celeriac and related product +OBT:002386 celery +OBT:001939 celery and related product +OBT:003579 celery juice +OBT:002387 celery leaf +OBT:001940 celery leaf and related product +OBT:000061 cell +OBT:000062 cell culture +OBT:000227 cell sediment +OBT:000546 cellar +OBT:000228 cellular shape phenotype +OBT:000547 cellulose +OBT:000548 cellulosic substrate +OBT:001028 central nervous system +OBT:003250 cep +OBT:001941 cep and related product +OBT:002968 cephalopod +OBT:001029 cereal and pseudo-cereal dough-based product +OBT:001478 cereal bar +OBT:003087 cereal based drink +OBT:002771 cereal crop +OBT:000229 cereal feed +OBT:000549 cereal flours feed +OBT:001479 cereal grain and primary derivative thereof +OBT:000550 cerebrospinal fluid +OBT:000973 Cerrado +OBT:000551 cervix +OBT:001942 ceviche +OBT:000552 chained +OBT:000553 chair +OBT:000554 chapel +OBT:002388 chard +OBT:001943 chard and related product +OBT:003469 Cheddar +OBT:001480 
cheese +OBT:001944 cheese brine +OBT:003251 cheese factory +OBT:001481 cheese rind +OBT:001482 cheese smear +OBT:001030 cheese starter culture +OBT:001945 cheeseburger +OBT:003185 cheesecake +OBT:001031 chemical plant +OBT:001483 chemical weapons factory +OBT:002389 chemically stressed soil +OBT:003281 chemo litho autotroph +OBT:002390 chemocline +OBT:002895 chemoheterotroph +OBT:003445 chemolithoheterotroph +OBT:002896 chemolithotroph +OBT:003282 chemoorganoheterotroph +OBT:000230 chemotactic +OBT:000555 chemotroph +OBT:002969 cherry +OBT:001946 cherry and primary derivative thereof +OBT:002391 cherry as food +OBT:002392 cherry tomato +OBT:000556 chest +OBT:001947 chewing gum +OBT:001032 chewing stick +OBT:002393 chhena +OBT:001948 chia seed +OBT:001484 chia seed and primary derivative thereof +OBT:003580 chibwantu +OBT:003314 chicken +OBT:001485 chicken coop +OBT:003186 chicken faeces +OBT:001949 chicken house +OBT:003130 chicken manure +OBT:002394 chicken meat +OBT:003187 chicken yard waste +OBT:003407 chief sourdough +OBT:003188 child +OBT:002395 chili pepper +OBT:001950 chili pepper and related product +OBT:003352 chili sauce +OBT:002396 chimpanzee +OBT:002397 chinchilla +OBT:002321 Chinese cabbage +OBT:001951 chinese cabbage and related product +OBT:001033 chipboard factory +OBT:001952 chipolata +OBT:002398 chive +OBT:001953 chive and related product +OBT:002772 chloramphenicol resistant +OBT:001034 chlorine-contaminated site +OBT:003488 chloroethene-contaminated aquifer +OBT:003408 chlorophenol-contaminated groundwater +OBT:001035 chlorophototroph +OBT:003252 chloropicrine-enriched soil +OBT:001036 chocolate product +OBT:001486 chopstick +OBT:003253 chorizo +OBT:003131 choux pastry +OBT:003189 chromate contaminated soil +OBT:000557 chyle +OBT:000558 chyme +OBT:003543 cider +OBT:003190 cinnamon +OBT:001954 cinnamon and related product +OBT:002773 ciprofloxacin resistant +OBT:002774 ciprofloxacin sensitive +OBT:002775 ciprofloxacin tolerant +OBT:000559 
circulatory system +OBT:000231 circulatory system part +OBT:002970 cis-dichloroethene contaminated sediment +OBT:002399 citronella grass +OBT:001955 citronella grass and related product +OBT:002971 citrus fruit +OBT:001487 citrus fruit and primary derivative thereof +OBT:001956 citrus tree +OBT:000232 city +OBT:003088 clam juice +OBT:001037 clay +OBT:001038 clean room +OBT:002694 clinic +OBT:001488 cloaca +OBT:000560 clothe +OBT:000233 cloud +OBT:000234 cloud water +OBT:002400 clove +OBT:001957 clove and related product +OBT:000063 coagulase activity +OBT:000235 coagulase negative +OBT:000236 coagulase positive +OBT:000561 coal +OBT:002972 coal mine lake +OBT:002695 coal mine lake sediment +OBT:001489 coal mine waste +OBT:001039 coal spoil +OBT:001958 coal spoil heap +OBT:001490 coal-cleaning residue +OBT:002401 coarse beach sand +OBT:000562 coast +OBT:002776 coastal aquifer +OBT:001040 coastal fish farm +OBT:001491 coastal lagoon mud +OBT:001492 coastal sand +OBT:002402 coastal sediment +OBT:003132 coastal water +OBT:001041 coastal wetland +OBT:000563 coccobacillus +OBT:000564 cochlear prosthesis +OBT:001493 cockroach +OBT:001959 cocktail drink +OBT:001960 cocoa +OBT:001961 cocoa bean +OBT:001494 cocoa bean and primary derivative thereof +OBT:001962 cocoa beverage +OBT:001963 cocoa butter +OBT:001964 cocoa powder +OBT:002973 coconut +OBT:001495 coconut and primary derivative thereof +OBT:001965 coconut as food +OBT:003353 coconut beverage +OBT:003376 coconut milk +OBT:003377 coconut water +OBT:002897 cod +OBT:002403 cod meat +OBT:003191 cod roe +OBT:000565 coelom fluid +OBT:000237 coelomic cavity +OBT:002974 coffee +OBT:001966 coffee beverage +OBT:002777 coffee plant +OBT:000566 coffeemaker +OBT:002975 cold resistant +OBT:000567 cold seep +OBT:002404 cold sensitive +OBT:002405 cold soil +OBT:000238 cold temperature environment +OBT:002406 cold tolerant +OBT:002696 cold-seep sediment +OBT:001967 collard green +OBT:001042 college +OBT:001496 colon +OBT:000064 colony 
morphology phenotype +OBT:000568 combustible liquid +OBT:001043 comma-shaped +OBT:000239 commensal +OBT:000240 commodity and primary derivative thereof +OBT:000569 common millet feed +OBT:003254 common mushroom +OBT:001968 common mushroom and related product +OBT:002697 community +OBT:000570 composite food +OBT:000571 compost +OBT:002898 compost biofilter +OBT:000241 composting reactor +OBT:001969 compote +OBT:003470 Comté +OBT:001497 concentrated food +OBT:001044 condiment +OBT:001045 confectionery +OBT:000242 conidial +OBT:001498 conifer +OBT:000243 conjugating +OBT:000065 conjugation phenotype +OBT:000572 conjunctiva +OBT:000066 constructed habitat +OBT:003192 constructed wetland +OBT:003446 contaminated aquifer +OBT:001970 contaminated drinking water +OBT:003042 contaminated groundwater +OBT:002778 contaminated sediment +OBT:000244 contaminated site +OBT:003043 contaminated soil +OBT:003409 contaminated soil with total petroleum hydrocarbon +OBT:002976 contaminated water +OBT:001971 cooked fish +OBT:001499 cooked food +OBT:002407 cool soil +OBT:001500 cooled food +OBT:001046 cooling tower +OBT:001047 cooling water +OBT:000245 copiotrophic +OBT:001048 coral +OBT:003133 coral reef water +OBT:003410 corn beverage +OBT:001501 corn chip +OBT:003447 corn milk +OBT:001502 corn silage +OBT:001503 corn tortillas +OBT:000573 cornea +OBT:001049 corneal ulcer +OBT:001504 coronary artery +OBT:001050 cortex +OBT:001051 cortical bone +OBT:000574 cosmetics +OBT:002408 cottage cheese +OBT:002977 cotton cultivated soil +OBT:002779 cotton plant +OBT:001052 cotton swab +OBT:001053 cotton-waste compost +OBT:002409 cotyledon +OBT:000575 countertop +OBT:002410 courgette +OBT:001972 courgette and related product +OBT:001973 couscous +OBT:003572 cow +OBT:001974 cow barn +OBT:001975 cow milk +OBT:003354 cowpea beverage +OBT:003378 cowpea milk +OBT:001976 crab +OBT:001977 crab and product thereof +OBT:002411 crab as food +OBT:000576 crabtree negative +OBT:000577 crabtree positive 
+OBT:001978 cracker +OBT:001979 cranberry and primary derivative thereof +OBT:002412 cranberry as food +OBT:003581 cranberry juice +OBT:000974 Crassulaceae +OBT:001505 cream +OBT:003559 cream cheese +OBT:002413 cream pastry +OBT:001054 cream pigmented +OBT:002414 creamery +OBT:001055 creek +OBT:002415 creek sediment +OBT:002899 creek water +OBT:002322 Crème de Brie de Meaux +OBT:001056 creosol +OBT:003379 creosote contaminated soil +OBT:003411 creosote wood preservative-contaminated soil +OBT:001506 cresote treated wood +OBT:001507 crocodile +OBT:001980 crocodile product +OBT:002416 croissant +OBT:000246 crop +OBT:001057 crucifer +OBT:001058 crude oil +OBT:003335 crude-oil-contaminated seawater +OBT:000247 cruise ship +OBT:002698 cruise ship passenger +OBT:001059 crustacean +OBT:001508 crustacean and product thereof +OBT:002417 crusty bread +OBT:000248 cryophile +OBT:000578 cuboidal +OBT:002418 cucumber +OBT:001981 cucumber and related product +OBT:003582 cucumber juice +OBT:003255 cultivated crucifer +OBT:001982 cultivated field +OBT:000067 cultivated habitat +OBT:002780 cultivated Leguminosae +OBT:002699 cultivated plant +OBT:000579 culture system +OBT:001983 cumin +OBT:003134 cupcake +OBT:001984 cured food +OBT:001985 curry powder +OBT:000580 curtain +OBT:003135 custard cake +OBT:003176 Cu-stressed soil +OBT:000581 cut +OBT:001060 cuticle +OBT:001509 cutlery +OBT:000582 cutting board +OBT:001510 cutting table +OBT:001986 cuttlefish and product thereof +OBT:000249 cyanide treatment bioreactor +OBT:001061 cyanobacterial mat +OBT:003193 dadih +OBT:003194 dahi +OBT:001987 dairy barn +OBT:003599 dairy cow +OBT:000583 dairy farm +OBT:000250 dairy farm equipment +OBT:001062 dairy farming waste +OBT:003594 dairy goat +OBT:001988 dairy industry +OBT:003315 dairy livestock +OBT:001511 dairy parlour waste +OBT:003598 dairy sheep +OBT:001512 dairy soup +OBT:000584 dairy starter culture +OBT:003283 dairy wastewater treatment plant +OBT:001513 dambo +OBT:002879 Damselfish 
+OBT:003502 Danablu +OBT:002323 Danish pastry +OBT:001514 dark chocolate +OBT:001515 dashi +OBT:001989 date and primary derivative thereof +OBT:002419 date as food +OBT:001990 date palm tree +OBT:001516 dead animal +OBT:001991 dead body +OBT:000585 dead matter +OBT:001063 dead organism +OBT:001064 dead tissue +OBT:000586 dead wood +OBT:002781 decantation tank +OBT:000068 decarboxylase activity +OBT:000251 decarboxylase negative +OBT:000252 decarboxylase positive +OBT:003136 decaying apple +OBT:003137 decaying bamboo leaf +OBT:003138 decaying bark +OBT:003089 decaying fruit +OBT:001517 decaying insect-invaded wood +OBT:003090 decaying leaf +OBT:003529 decaying leaf litter +OBT:003591 decaying leaf litter from a pine forest +OBT:001518 decaying marine algae +OBT:001065 decaying matter +OBT:003044 decaying plant material +OBT:003091 decaying wood +OBT:000587 deep periodontal lesion +OBT:000588 deep sea +OBT:001519 deep sea mud +OBT:000253 deep subsurface +OBT:000589 deep tissue +OBT:002782 deep-sea hot vent +OBT:002783 deep-sea hydrothermal vent +OBT:002784 deep-sea hydrothermal vent chimney +OBT:002420 deep-sea sediment +OBT:003380 deer +OBT:002700 deer herd meat +OBT:002421 deer meat +OBT:002422 deer tick +OBT:001066 defrosted food +OBT:002785 dendritic cell +OBT:000254 denitrification reactor +OBT:001067 dental abscess +OBT:000590 dental biofilm +OBT:001068 dental caries +OBT:000255 dental chair +OBT:001520 dental plaque +OBT:000591 dental prothesis +OBT:001521 dental root +OBT:001522 dental root canal +OBT:000592 desert +OBT:001069 desert rock +OBT:000593 desert soil +OBT:001992 desiccation resistant +OBT:001993 desiccation sensitive +OBT:001994 desiccation tolerant +OBT:001995 deteriorated canned food +OBT:001996 diadromous fish meat +OBT:000256 diagnostic equipment +OBT:000257 diazotroph +OBT:001070 dietary supplement +OBT:002786 digester +OBT:001523 digester sludge +OBT:003256 digestive chamber +OBT:000258 digestive system part +OBT:003284 diner +OBT:001071 
dining car +OBT:001072 dinner plate +OBT:000069 diploid +OBT:001997 disease resistant +OBT:001998 disease sensitive +OBT:001999 disease tolerant +OBT:001073 dish +OBT:002787 dish towel +OBT:000594 dishcloth +OBT:000595 dishwasher +OBT:001074 district heating plant +OBT:001524 ditch mud +OBT:002423 ditch sediment +OBT:000259 ditch water +OBT:003092 dog +OBT:002424 dog tick +OBT:003045 dolphin +OBT:002425 dolphinfish meat +OBT:000596 domestic animal +OBT:000260 domestic appliance +OBT:001525 domestic sewage +OBT:003257 domestic wastewater treatment plant +OBT:003139 donkey +OBT:003412 dosa +OBT:002000 dough mixer +OBT:002001 dragee +OBT:001075 drainage +OBT:001076 drainage canal +OBT:001077 drainage ditch +OBT:002002 dried animal product +OBT:002003 dried bean +OBT:001526 dried food +OBT:002004 dried lentil +OBT:001527 dried nut +OBT:002005 dried pasta +OBT:002006 dried pea +OBT:002007 dried plant product +OBT:002008 dried seaweed +OBT:002788 drilling bore water +OBT:001528 drilling mud +OBT:001529 drilling pipe +OBT:001078 drink +OBT:001079 drinking glass +OBT:001530 drinking water +OBT:000597 drinking water facility +OBT:001080 drinking water filter +OBT:001081 drinking water reservoir +OBT:001082 drinking water supply +OBT:002978 drinking water system +OBT:002900 drinking water treatment plant +OBT:001531 drosophila +OBT:002324 Drosophila melanogaster +OBT:000261 drug +OBT:000598 drug resistant +OBT:000599 drug sensitive +OBT:000600 drug tolerant +OBT:001532 dry forest humus +OBT:003195 dry sausage +OBT:002426 dry soil +OBT:003316 duck +OBT:002009 duck egg +OBT:002427 duck meat +OBT:002901 dumpster +OBT:000601 dune soil +OBT:001533 duodenal ulcer +OBT:001534 duodenum +OBT:002010 durian and primary derivative thereof +OBT:002428 durian as food +OBT:000262 durotactic +OBT:002701 durum wheat product +OBT:000263 dust +OBT:002902 dustbin +OBT:002979 dye textile wastewater +OBT:000602 ear +OBT:000603 ear canal +OBT:000264 ear part +OBT:001083 ear thermometer +OBT:000604 
ear wax +OBT:000605 earring hole +OBT:000027 Earth +OBT:000606 earth +OBT:002011 earthworm-eating bird +OBT:000607 eating and drinking place +OBT:001084 eating utensil +OBT:001535 echinoderm and product thereof +OBT:003242 Éclair +OBT:000608 ectomycorrhizal fungus +OBT:000609 ectoparasite +OBT:000610 edible film +OBT:002980 edible oil and related product +OBT:001085 eel farm +OBT:002429 eel meat +OBT:000611 effluent +OBT:002981 egg +OBT:001086 egg and egg product +OBT:001536 egg based dish +OBT:002430 egg cell +OBT:000612 egg part +OBT:002012 egg sac +OBT:002013 egg sac part +OBT:001537 egg white +OBT:001538 egg yolk +OBT:002702 einkorn wheat product +OBT:001539 elbow +OBT:003285 elderly person +OBT:001540 electronics device industry +OBT:000265 electrotactic +OBT:003413 elk +OBT:002431 elk meat +OBT:001541 elkhorn coral +OBT:001542 elm +OBT:001543 embryo +OBT:002014 embryo +OBT:002015 embryo part +OBT:002432 embryonic axis +OBT:002433 embryonic axis part +OBT:002703 embryonic root +OBT:002704 embryonic root part +OBT:000613 embryonic structure +OBT:003140 emesis basin +OBT:003471 Emmental +OBT:002705 emmer wheat product +OBT:002434 emperor meat +OBT:001087 empyema +OBT:000614 encapsulated +OBT:001088 endodermis +OBT:000070 endolithic +OBT:000266 endolithic environment +OBT:000071 endolython +OBT:000615 endometrium +OBT:000616 endoparasite +OBT:000072 endopelic +OBT:000073 endopelon +OBT:000617 endophyte +OBT:000074 endophyton +OBT:000075 endopsammic +OBT:000076 endopsammon +OBT:002016 endosperm +OBT:000267 endosphere +OBT:002789 endothelium +OBT:000077 endozoon +OBT:000618 endozootic +OBT:002435 enriched bread +OBT:002017 enriched dough +OBT:002436 enriched soil +OBT:000268 enterocyte +OBT:000269 environment water +OBT:000078 environment wrt oxygen level +OBT:000079 environmental matter +OBT:002018 environmental water with chemical property +OBT:002019 environmental water with physical property +OBT:001544 eosinophil +OBT:001089 epidermis +OBT:001090 epidermis 
part +OBT:000080 epilythic +OBT:000081 epilython +OBT:000082 epipelic +OBT:000083 epipelon +OBT:000084 epipsammic +OBT:000085 epipsammon +OBT:000619 epithelial layer +OBT:001091 epithelium +OBT:000086 epixylic +OBT:000087 epixylon +OBT:000088 epizoon +OBT:000620 epizootic +OBT:003503 Epoisses +OBT:002020 equine meat +OBT:002437 erythrocyte +OBT:002790 erythromycin resistant +OBT:002438 escarole +OBT:002021 escarole and related product +OBT:002791 eschar +OBT:002706 estuarine sediment +OBT:001092 estuary +OBT:002792 ethambutol resistant +OBT:001545 eucalyptus tree +OBT:000089 eukaryote host +OBT:003123 Euprymna scolopes +OBT:002439 eutrophic water +OBT:001093 evaporator +OBT:003566 ewe +OBT:002022 ewe milk +OBT:000621 excavation +OBT:000622 excreta +OBT:000270 exoskeleton +OBT:000271 exosphere +OBT:000007 experimental medium +OBT:003489 extra hard cheese +OBT:000274 extracellular +OBT:000624 extractive industrial site +OBT:001094 extractive industry equipment +OBT:000272 extra-genital +OBT:000273 extra-intestinal +OBT:000623 extra-uterus +OBT:003286 extreme acid mine drainage +OBT:002982 extreme cold resistant +OBT:002440 extreme cold sensitive +OBT:002441 extreme cold tolerant +OBT:000090 extreme environment +OBT:001095 extreme halophile +OBT:002983 extreme heat resistant +OBT:002442 extreme heat sensitive +OBT:002443 extreme heat tolerant +OBT:000625 extreme high temperature environment +OBT:000275 extreme thermophile +OBT:000626 extremely acid environment +OBT:000276 extremely acidophilic +OBT:000627 eye +OBT:000277 eye part +OBT:001096 facial tissue +OBT:001097 factory +OBT:000278 facultative aerobe +OBT:000279 facultative anaerobe +OBT:001098 facultative chemo lithotroph +OBT:003560 faisselle +OBT:000091 farm +OBT:002903 farmed fish +OBT:002023 farmed fish meat +OBT:003317 farmer +OBT:003141 farmyard manure +OBT:001099 fast food restaurant +OBT:001100 fat body +OBT:002707 faucet handle +OBT:001101 feather pillow +OBT:001102 feces +OBT:000280 feeder cell 
+OBT:001103 female animal +OBT:003196 female tsetse fly +OBT:001104 fen +OBT:002444 fennel +OBT:002024 fennel and related product +OBT:003287 fenugreek +OBT:002025 fermentation vat +OBT:001105 fermentative +OBT:003142 fermented beet +OBT:003343 fermented beverage +OBT:003143 fermented butter +OBT:003600 fermented cabbage juice +OBT:003144 fermented cereal-based product +OBT:003381 fermented cheese +OBT:003592 fermented cottage cheese +OBT:003093 fermented dairy product +OBT:003414 fermented dough +OBT:003555 fermented dry sausage +OBT:003597 fermented Elaeis Palm sap +OBT:003145 fermented fish product +OBT:001546 fermented food +OBT:003556 fermented fresh cheese +OBT:003344 fermented fruit +OBT:003596 fermented juice +OBT:003046 fermented liquid +OBT:003094 fermented meat +OBT:003146 fermented milk +OBT:003047 fermented plant-based food +OBT:003095 fermented seafood +OBT:003147 fermented shrimp paste +OBT:003355 fermented soybean +OBT:003573 fermented table olive +OBT:003533 fermented tea leaf +OBT:003096 fermented vegetable product +OBT:002026 ferret +OBT:002984 ferrous iron oxidizing +OBT:003148 fertilized soil +OBT:000281 fertilizer +OBT:003448 feta +OBT:000092 field +OBT:002445 field soil +OBT:002027 fig and primary derivative thereof +OBT:002446 fig as food +OBT:000629 filamentous +OBT:000628 filament-shaped +OBT:002028 filarial nematode +OBT:001547 fillet +OBT:003197 Filmjölk +OBT:002447 filtered tap water +OBT:003415 final sourdough +OBT:002448 finch +OBT:000282 finger +OBT:002029 fir tree +OBT:002793 fish +OBT:001548 fish based dish +OBT:000630 fish farm +OBT:002030 fish farming pond +OBT:000631 fish meal +OBT:001549 fish meat and fish meat product +OBT:000283 fish pen +OBT:000093 fish pond +OBT:003149 fish roe and product thereof +OBT:002031 fish sashimi +OBT:001550 fish soup +OBT:003150 fish waste +OBT:002032 fish-eating bird +OBT:003198 fjord water +OBT:000284 flagellum +OBT:003199 flatfish-Sikhae +OBT:002449 flavoured water +OBT:003416 flax milk 
+OBT:002033 flaxseed +OBT:001551 flaxseed and primary derivative thereof +OBT:003382 flaxseed beverage +OBT:002034 flea +OBT:000285 flesh +OBT:000286 flocculent +OBT:002794 flooded soil +OBT:002035 flor +OBT:002450 flouder meat +OBT:001552 flour +OBT:002904 flower +OBT:001106 flower part +OBT:002795 flowing water +OBT:000632 fluffy colony +OBT:001107 fluorescent pigmented +OBT:002036 fluvial dambo +OBT:001553 fly +OBT:002037 foie gras +OBT:002038 folivorous bird +OBT:003472 Fontina +OBT:000008 food +OBT:001554 food blender +OBT:001108 food booth +OBT:001109 food cart +OBT:001110 food container +OBT:001555 food dicing machine +OBT:001556 food fermentation equipment +OBT:002039 food fermentation industry +OBT:001111 food flavour +OBT:000094 food for human +OBT:000633 food for particular diet +OBT:001112 food processing appliance +OBT:001113 food processing effluent +OBT:001557 food processing factory +OBT:001558 food processing waste +OBT:002796 food processing wastewater +OBT:001114 food rind +OBT:001559 food slicing machine +OBT:001115 food truck +OBT:001116 foot +OBT:002451 footwarm bath +OBT:000287 forage +OBT:001117 foregut +OBT:001118 forehead thermometer +OBT:001119 forest +OBT:001120 forest humus +OBT:003417 forest musk deer +OBT:002452 forest pond sediment +OBT:000634 forest soil +OBT:001561 forest tree +OBT:001560 fore-stomach +OBT:002040 fork as utensil +OBT:003473 Formaggio di Fossa +OBT:003288 formula fed infant +OBT:001562 fossil stromatolite +OBT:002041 fowl +OBT:003048 fox +OBT:002708 fracture water +OBT:000288 free-living +OBT:000635 free-living diazotroph +OBT:000636 freezer +OBT:002985 freeze-thaw resistant +OBT:002453 freeze-thaw sensitive +OBT:002454 freeze-thaw tolerant +OBT:000289 freight transport equipment +OBT:003557 French dry sausage +OBT:002042 fresh animal manure +OBT:002043 fresh cheese +OBT:003049 fresh meat +OBT:002044 fresh pasta +OBT:002455 freshwater +OBT:000637 freshwater aquarium +OBT:002456 freshwater bream meat +OBT:002905 
freshwater fish +OBT:002045 freshwater fish meat +OBT:003050 freshwater hotspring +OBT:003200 freshwater lake +OBT:003051 freshwater marsh +OBT:001563 freshwater mud +OBT:002046 freshwater sediment +OBT:001121 freshwater wetland +OBT:002047 fried food +OBT:002048 fried rice +OBT:002049 frikadeller +OBT:003052 frog +OBT:002457 frog leg +OBT:002050 frog product +OBT:003561 fromage blanc +OBT:001564 frozen food +OBT:002458 frozen soil +OBT:003449 frozen yogurt +OBT:002906 fruit +OBT:001122 fruit and primary derivative thereof +OBT:001565 fruit based dish +OBT:003534 fruit based juice +OBT:002051 fruit fly +OBT:001123 fruit part +OBT:001566 fruit rind +OBT:002052 fruit salad +OBT:002053 fruit smoothie +OBT:001567 fruit tree +OBT:001568 fruit with edible peel and primary derivative thereof +OBT:001569 fruit with inedible peel and primary derivative thereof +OBT:002054 fruit-eating bird +OBT:001570 fruiting vegetable +OBT:001571 fuel ethanol production facility +OBT:001572 fuel oil piping system +OBT:003562 fuet +OBT:000290 fungi +OBT:001573 fungi and related product +OBT:002055 fungi as food +OBT:001574 funicle +OBT:000291 furniture +OBT:003258 furuncle +OBT:001124 furuncle fluid +OBT:003540 Fuzhuan brick tea +OBT:000638 gall bladder +OBT:002056 game bird meat +OBT:002057 game mammal meat +OBT:002459 gamefowl +OBT:000292 garden +OBT:003097 garden lettuce +OBT:001125 garden plant +OBT:000639 garden soil +OBT:001126 garden vegetable and primary derivative thereof +OBT:002058 garlic and related product +OBT:002460 garlic as food +OBT:003535 garlic oil +OBT:003151 garlic plant +OBT:001575 gas piping system +OBT:001127 gas seep +OBT:001576 gas tank +OBT:000640 gastric acid +OBT:001128 gastric antrum +OBT:001129 gastric body +OBT:001577 gastric mucosa +OBT:000641 gastrointestinal tract +OBT:000642 gastrointestinal tract part +OBT:001578 gazpacho +OBT:002059 geese egg +OBT:001130 gelatine as ingredient +OBT:003152 gelato +OBT:000643 genital tract +OBT:002797 gentamicin 
resistant +OBT:002798 gentamicin sensitive +OBT:002799 geothermal aquifer +OBT:000293 geothermal area +OBT:001579 geothermal lake +OBT:000294 germ cell +OBT:001580 geyser +OBT:002461 gherkin +OBT:002060 gherkin and related product +OBT:000644 gill +OBT:003201 ginger +OBT:002061 ginger and related product +OBT:000645 gingival crevice +OBT:000646 gingival lesion +OBT:001581 gingival sulcus +OBT:002800 ginseng plant +OBT:003259 girl +OBT:001582 gizzard +OBT:002709 glacier +OBT:003202 glassy rim of the pillow basalt +OBT:001583 glassy rind of lava +OBT:002062 glassy rind of seafloor basalt +OBT:000647 glial cell +OBT:000295 gliding +OBT:000470 Glomus vesiculiferum +OBT:003177 Glossina +OBT:002462 gluten-free bread +OBT:002801 glycopeptide antibiotics resistant +OBT:002063 gnocchi +OBT:003490 goat +OBT:002064 goat meat +OBT:002065 goat milk +OBT:001584 gold mine +OBT:003289 gold mine wastewater +OBT:001131 golden pigmented +OBT:003318 goose +OBT:002463 goose meat +OBT:003504 Gorgonzola +OBT:003474 Gouda +OBT:003053 goulash +OBT:002710 graft recipient +OBT:001132 grain and primary derivative thereof +OBT:003054 grain based drink +OBT:000296 gram stain phenotype +OBT:000648 gram-negative +OBT:000649 gram-positive +OBT:000650 gram-variable +OBT:003153 granita +OBT:000651 granite stone +OBT:001133 granitic rock +OBT:002802 granitic rock aquifer +OBT:000297 granular +OBT:003475 Granular +OBT:001134 granulocyte +OBT:000298 granuloma +OBT:002986 grape +OBT:002066 grape and primary derivative thereof +OBT:003583 grape juice +OBT:002464 grape leaf +OBT:002067 grape leaf and related product +OBT:000652 grape-like clustered +OBT:002803 grapevine +OBT:001135 grass plant +OBT:001136 grass silage +OBT:000299 grassland +OBT:002907 gravel aquifer +OBT:000300 gravitactic +OBT:003558 Greek sausage +OBT:000653 green forage +OBT:001137 green pigmented +OBT:001585 green tea leaf +OBT:000095 greenhouse +OBT:002465 greenhouse soil +OBT:002068 griddled food +OBT:002069 grilled food +OBT:001586 
grocery +OBT:000654 groin +OBT:002466 ground beef +OBT:001138 ground food +OBT:002467 ground water +OBT:002468 groundwater body +OBT:000655 growing plant +OBT:003476 Gruyère +OBT:003356 guacamole +OBT:001587 guano +OBT:001139 guar gum +OBT:003290 guar plant +OBT:003505 Gubbeen +OBT:002469 guinea fowl meat +OBT:002470 guinea pig +OBT:002070 gum drop +OBT:003098 gum margin +OBT:003055 gum tissue +OBT:003124 Gundruk +OBT:001140 gut +OBT:000009 habitat wrt chemico-physical property +OBT:002471 haddock meat +OBT:002908 hagfish +OBT:000301 hair +OBT:001141 hairspray +OBT:002472 hake meat +OBT:002473 halibut meat +OBT:000302 haline environment +OBT:000656 halophile +OBT:000657 halophobe +OBT:002711 halotolerant +OBT:002474 ham +OBT:002071 hamburger +OBT:001588 hamburger meat +OBT:002475 hamster +OBT:001589 hand +OBT:003506 Hand cheese +OBT:000658 handkerchief +OBT:000096 haploid +OBT:000659 harbor +OBT:003450 hard cheese +OBT:002072 hard tick +OBT:003056 hare +OBT:002476 hare meat +OBT:000660 harvesting tool +OBT:000303 hatchery +OBT:002073 hay +OBT:002074 hazelnut +OBT:001590 hazelnut and primary derivative thereof +OBT:003357 hazelnut beverage +OBT:003383 hazelnut milk +OBT:001142 head +OBT:002477 head cabbage +OBT:002075 head cabbage and related product +OBT:000661 head kidney +OBT:002712 healthy person +OBT:002713 heart +OBT:000662 heart valve +OBT:001591 heartwood +OBT:002909 heat exchanger +OBT:002987 heat resistant +OBT:002478 heat sensitive +OBT:002479 heat stressed soil +OBT:002480 heat tolerant +OBT:001592 heat-preserved food +OBT:003203 heavy metal contaminated soil +OBT:000663 hemodialysis machine +OBT:000664 hemolymph +OBT:000304 hemolytic +OBT:003418 hemp milk +OBT:002076 hemp seed +OBT:001593 hemp seed and primary derivative thereof +OBT:003384 hempseed beverage +OBT:003319 hen +OBT:002077 hen egg +OBT:003204 herbicide enriched soil +OBT:002078 herbivore +OBT:002481 herring emat +OBT:000305 heterothallic +OBT:000665 heterotroph +OBT:000666 high chair tray 
+OBT:000306 high osmolarity environment +OBT:000307 high pressure environment +OBT:002079 high pressure treated food +OBT:000667 high salt concentration environment +OBT:000308 high sulfur concentration environment +OBT:000309 high temperature environment +OBT:001594 high temperature oil field +OBT:002988 high-level radioactive sediment +OBT:000668 highly acid environment +OBT:003260 highly alkaline saline soda lake +OBT:002482 hilum +OBT:001143 hindgut +OBT:001144 hip bone +OBT:001595 holoplankton +OBT:000669 home drainage system +OBT:000670 home food processing equipment +OBT:000671 home heating system +OBT:002804 home plumbing +OBT:001145 home-made food +OBT:000310 homothallic +OBT:001596 honey +OBT:001146 honey and apiculture product +OBT:002080 honey bee +OBT:001147 hoof +OBT:003205 hopped wort +OBT:003320 horse +OBT:003154 horse manure +OBT:002483 horseradish +OBT:002081 horseradish and related product +OBT:001148 horticultural waste +OBT:000311 horticulture farm +OBT:002714 hospital +OBT:002484 hospital bed +OBT:002989 hospital drinking water +OBT:000097 hospital environment +OBT:002082 hospital equipment +OBT:000672 hospital gown +OBT:000673 hospital hot water +OBT:003155 hospital humidifier +OBT:002485 hospital nebulizer +OBT:003358 hospital tap water +OBT:000312 hospital water +OBT:002715 hospital water distribution system +OBT:002486 hospital water supply +OBT:000313 host associated biofilm +OBT:001597 hot dog +OBT:001598 hot drink +OBT:001599 hot mud +OBT:000674 hot spring biomat +OBT:002487 hot tap water +OBT:002805 hot water distribution system +OBT:001149 hot water tank +OBT:000675 hotel +OBT:001150 hotel bathroom +OBT:000676 hotel carpet +OBT:000314 hotel equipment +OBT:002990 hotspring +OBT:000315 house +OBT:000098 household good +OBT:000316 household product +OBT:000677 household waste +OBT:002488 human +OBT:001152 human appendix abscess +OBT:001151 human Bartholin abscess +OBT:000678 human body +OBT:002716 human body louse +OBT:002489 human 
filarial nematode +OBT:002717 human head louse +OBT:002490 human louse +OBT:002083 human milk +OBT:002806 human pathogen +OBT:001153 humidifier +OBT:000679 humus +OBT:000680 humus soil +OBT:003099 humus-rich acidic ash soil +OBT:001600 hydathode +OBT:003359 hydrocarbon contaminated soil +OBT:002991 hydrogen oxidizing +OBT:000317 hydrotactic +OBT:000318 hydrotelluric environment +OBT:002718 hydrothermal vent +OBT:002719 hydrothermal vent chimney +OBT:002491 hyper saline brine sediment +OBT:003261 hypersaline lake +OBT:001154 hypersaline microbial mat +OBT:002720 hypersaline water +OBT:000319 hyperthermophile +OBT:000099 hyphae growth +OBT:002721 hypocotyl +OBT:000681 ice +OBT:003100 ice cream +OBT:002084 ice tea +OBT:002085 ice-cream factory +OBT:003057 iguana +OBT:001601 ileum +OBT:003156 ill person +OBT:003101 illuminated anoxic zone of aquatic environment +OBT:000320 immune cell +OBT:000321 immune system +OBT:003206 immunodeficient person +OBT:001155 incontinence pad +OBT:001156 indigo pigmented +OBT:000682 indoor air +OBT:002086 industrial bakery +OBT:000683 industrial building +OBT:000322 industrial chemical +OBT:002807 industrial effluent treatment plant +OBT:000684 industrial equipment +OBT:000100 industrial habitat +OBT:001157 industrial organic waste +OBT:000323 industrial product +OBT:001158 industrial scrap +OBT:000324 industrial site +OBT:001159 industrial sludge +OBT:000685 industrial waste +OBT:003262 industrial waste water treatment plant +OBT:002808 industrial wastewater +OBT:000325 industrial water and effluent +OBT:001160 indwelling urinary catheter +OBT:003263 infant +OBT:001161 infant formula +OBT:001162 ingredient for hot drink +OBT:000326 inland water body +OBT:003291 inmate +OBT:000686 innate immune system +OBT:000687 inner ear +OBT:002910 inorganically contaminated sediment +OBT:001163 insect +OBT:000688 insect part +OBT:002809 insect pathogen +OBT:002087 insect product +OBT:003207 insecticide enriched soil +OBT:000327 inside the body 
+OBT:002088 integument +OBT:000328 intensive care unit +OBT:000101 intercellular +OBT:000689 interstitial fluid +OBT:001854 Intertidal sand +OBT:002722 intertidal sediment +OBT:000690 intertidal zone +OBT:000691 intestinal content +OBT:002992 intestinal epithelium +OBT:002993 intestinal mucosa +OBT:000471 Intestinal mucosal lesion +OBT:002810 intestine +OBT:000329 intracellular +OBT:001602 intra-uterine progeny +OBT:000692 intra-uterus +OBT:000693 intravascular catheter +OBT:000694 invertebrate species +OBT:001603 ionised food +OBT:002492 ionizing radiation resistant +OBT:002493 ionizing radiation sensitive +OBT:002494 ionizing radiation tolerant +OBT:000102 iron-rich environment +OBT:001164 irrigation ditch +OBT:002811 isoniazid resistant +OBT:002495 jacuzzi +OBT:000695 jail +OBT:002089 jam +OBT:001604 jarred food +OBT:001605 jejunum +OBT:001165 jellyfish +OBT:001606 jellyfish and product thereof +OBT:001166 joint +OBT:001607 jointvetch +OBT:003058 juice +OBT:000696 jungle +OBT:002090 kaki and primary derivative thereof +OBT:002496 kaki as food +OBT:002497 kangaroo +OBT:003208 kazunoko +OBT:003102 kebab +OBT:003209 kefir +OBT:002911 kelp +OBT:001167 kerosene +OBT:002723 khorasan product +OBT:002724 kidney +OBT:003419 kimchi +OBT:002325 Kiri +OBT:000697 kitchen +OBT:000330 kitchen equipment +OBT:001168 kitchen garbage +OBT:000698 kitchen sink +OBT:000699 kitchen sponge +OBT:002994 kiwi +OBT:002091 kiwi and primary derivative thereof +OBT:002498 kiwi as food +OBT:001608 knee +OBT:002326 Kniekiechl +OBT:002092 knife +OBT:002093 kombu +OBT:002094 kombucha +OBT:003210 kumis +OBT:000028 L-( null )-tartrate enriched soil +OBT:001169 laboratory +OBT:000700 laboratory animal +OBT:000103 laboratory equipment +OBT:003211 laboratory mice +OBT:003212 laboratory rat +OBT:003420 laboratory sourdough +OBT:001170 lager yeast +OBT:003213 lagoon +OBT:003477 Laguiole +OBT:001171 lake +OBT:002499 lake sediment +OBT:003569 lamb +OBT:002095 lamb meat +OBT:000331 landfill +OBT:003385 
landfill contaminated by PCB +OBT:002096 landfill leachate +OBT:001172 landfill site waste +OBT:002500 landfowl +OBT:003214 långfil +OBT:003507 Langres +OBT:003292 large intestine +OBT:001609 larvae +OBT:000701 laryngeal prosthetic device +OBT:002097 lasagna +OBT:001610 latex processing factory +OBT:000702 latrine +OBT:002501 laurel +OBT:002098 laurel and related product +OBT:002502 lavander +OBT:002099 lavander and related product +OBT:001173 leachate +OBT:002812 leaching column +OBT:001174 leaf +OBT:003059 leaf based drink +OBT:003060 leaf epidermis +OBT:002912 leaf litter +OBT:001611 leaf margin +OBT:001175 leaf part +OBT:001612 leafhopper +OBT:000703 leafy soil +OBT:001613 leafy vegetable +OBT:002100 lean dough +OBT:002503 leavened bread +OBT:003421 leavened dough +OBT:001614 leech +OBT:002504 leek +OBT:002101 leek and related product +OBT:003478 Leerdammer +OBT:002102 left arm +OBT:001176 leg +OBT:001615 legume based dish +OBT:003061 legume based drink +OBT:001177 legume seed and primary derivative thereof +OBT:001616 legume soup +OBT:002103 lemon and primary derivative thereof +OBT:002505 lemon as food +OBT:003584 lemon juice +OBT:000704 lentic water +OBT:000705 lentic water body +OBT:001178 lenticel +OBT:002104 lentil +OBT:001617 lentil and related product +OBT:001179 leopard +OBT:000332 lesion +OBT:002506 lettuce +OBT:002105 lettuce and related product +OBT:000706 leukocyte +OBT:000707 lichen +OBT:000104 lichen forming +OBT:003508 Liederkranz +OBT:000708 light organ +OBT:001180 lignocellulose +OBT:003509 Limburger +OBT:002106 lime and primary derivative thereof +OBT:002507 lime as food +OBT:001618 lime soap +OBT:000709 limestone +OBT:000710 lining +OBT:002913 lining of the small intestine +OBT:003062 lion +OBT:002107 liqueur +OBT:001181 liquid agricultural waste +OBT:000105 liquid culture morphology phenotype +OBT:002108 liquid egg mixed +OBT:001619 liquid egg product +OBT:002109 liquid egg white +OBT:002110 liquid egg yolk +OBT:000711 liquid food 
+OBT:000106 liquid medium +OBT:002111 litchi and primary derivative thereof +OBT:002508 litchi as food +OBT:000712 lithotroph +OBT:001620 litter +OBT:003510 Livarot +OBT:002725 liver +OBT:001182 liver abscess +OBT:003103 liver paste +OBT:002726 livestock +OBT:001621 livestock barn +OBT:003336 livestock boar +OBT:001183 livestock habitat +OBT:003104 livestock manure +OBT:000010 living organism +OBT:002112 lizard product +OBT:002113 lobster +OBT:002114 lobster and product thereof +OBT:002509 lobster as food +OBT:000713 lotic water body +OBT:002115 loukoumi +OBT:002116 louse +OBT:002510 louse-born +OBT:002511 low nutrient aquatic habitat +OBT:002117 low salinity mud flat sediment +OBT:003293 low temperature ground water +OBT:001184 lower gastrointestinal tract part +OBT:000333 lower layer of a microbial mat +OBT:000714 lower respiratory tract +OBT:000715 lucerne +OBT:001185 lumber +OBT:001186 lunch box +OBT:000716 lung +OBT:002118 lupin +OBT:001622 lupin and related product +OBT:003360 lupin beverage +OBT:003386 lupin milk +OBT:002119 lupin seed +OBT:000717 lymph +OBT:000718 lymph node +OBT:000719 lymphatic system +OBT:000334 lymphatic system part +OBT:001623 lymphocyte +OBT:001187 lynx +OBT:001188 lyophilized milk starter +OBT:000107 lytic +OBT:003574 mabisi +OBT:001189 machinery +OBT:003294 mackerel +OBT:002512 mackerel meat +OBT:002995 macrophage +OBT:000335 magnetotactic +OBT:002513 maize +OBT:002120 maize and primary derivative thereof +OBT:000720 maize feed +OBT:002914 maize plant +OBT:001190 maize silage +OBT:000721 maize storage +OBT:001191 malachite green effluent +OBT:001192 male animal +OBT:003215 malt +OBT:000722 malt feed +OBT:003567 malt vinegar +OBT:002813 malt vinegar brewery +OBT:001624 mammal meat +OBT:001625 mammalian +OBT:003295 mammalian livestock +OBT:000723 mammalian part +OBT:001193 mammary gland +OBT:003451 man +OBT:002121 mandarin and primary derivative thereof +OBT:002514 mandarin as food +OBT:002996 mango +OBT:002122 mango and primary 
derivative thereof +OBT:002515 mango as food +OBT:001626 manufacture +OBT:001627 manure +OBT:001194 manure compost +OBT:001628 maple tree +OBT:002123 mare milk +OBT:000336 mariculture farm +OBT:001629 marinated food +OBT:002915 marine and hypersaline microbial mat +OBT:002124 marine anoxic mud +OBT:002125 marine black mud +OBT:000337 marine cage +OBT:001195 marine coast +OBT:001630 marine crustacean +OBT:000338 marine environment +OBT:000724 marine eukaryotic species +OBT:003553 marine farm fish +OBT:003264 marine fish +OBT:002126 marine fish meat +OBT:000725 marine freight transport equipment +OBT:002516 marine marsh sediment +OBT:002814 marine microbial mat +OBT:001631 marine mud +OBT:001196 marine rock +OBT:002127 marine sediment +OBT:002997 marine sponge +OBT:002517 marine sulfidic sediment +OBT:003105 marine water +OBT:002815 marine wetland +OBT:001632 market garden plant +OBT:003511 Maroilles +OBT:001197 maroon pigmented +OBT:001198 marsh +OBT:002128 marshmallow +OBT:003216 masago +OBT:001199 mashed food +OBT:002998 mast cell +OBT:000172 MATa +OBT:000173 MATalpha +OBT:003217 matsoni +OBT:001633 mayonnaise +OBT:000339 meal +OBT:000340 meat and bone meal +OBT:001200 meat and meat product +OBT:001634 meat based dish +OBT:001635 meat hook +OBT:002129 meat industry +OBT:001636 meat juice +OBT:000726 meat meal +OBT:002130 meat patty +OBT:002131 meat sashimi +OBT:001637 meat soup +OBT:001638 meatball +OBT:003277 Medicago +OBT:000727 medical bed sheet +OBT:002518 medical center +OBT:000011 medical environment +OBT:000108 medical equipment +OBT:000728 medical glove +OBT:000729 medical mask +OBT:000341 medical outfit +OBT:000109 medical product +OBT:000110 medical sample +OBT:000342 medical sink +OBT:003321 medical staff +OBT:003387 meju +OBT:002999 melon +OBT:002132 melon and related product +OBT:002519 melon as food +OBT:002816 meltwater +OBT:000730 membrane +OBT:000731 meninges +OBT:003265 mercury-enriched soil +OBT:002133 merguez +OBT:001639 meringue +OBT:001201 
meristem +OBT:001640 meromictic lake +OBT:001641 meroplankton +OBT:001642 mesenteric artery +OBT:001202 mesentery +OBT:001203 meso-halophile +OBT:000343 mesophile +OBT:000344 mesosphere +OBT:002520 mesotrophic water +OBT:003157 metal contaminated soil +OBT:002521 metal resistant +OBT:002522 metal sensitive +OBT:002523 metal tolerant +OBT:000345 metallic coin +OBT:000732 metaphytic +OBT:000111 metaphyton +OBT:003593 metata ayib +OBT:001643 methane seep +OBT:000733 methanogenic +OBT:000346 methanogenic reactor +OBT:001204 methanol oxidizing +OBT:001205 methanotroph +OBT:000734 methylotroph +OBT:002817 meticillin resistant +OBT:002818 meticillin sensitive +OBT:002819 meticillin tolerant +OBT:000347 microaerophile +OBT:000348 microaerophilic environment +OBT:000735 microaerotolerant +OBT:000736 microanaerobe +OBT:000001 microbial habitat +OBT:000737 microbial mat +OBT:000112 microbial mat layer +OBT:000002 microbial phenotype +OBT:000113 microflora +OBT:000114 microorganism +OBT:000012 microorganism associated habitat +OBT:000349 microorganism gas vesicle +OBT:000115 microorganism part +OBT:002134 micropyle +OBT:000116 microscopic morphological phenotype +OBT:000738 microwave oven +OBT:000740 middle ear +OBT:001206 midge +OBT:001207 midgut +OBT:002135 mid-ocean ridge basalt +OBT:001644 midrib +OBT:000739 mid-vaginal wall +OBT:003322 military service member +OBT:001645 milk +OBT:001208 milk and milk product +OBT:001646 milk chocolate +OBT:002136 milk product +OBT:002524 milk rice +OBT:000741 milking machine +OBT:002916 mill wastewater +OBT:001209 milled food +OBT:002525 millet +OBT:002137 millet and primary derivative thereof +OBT:001210 mine +OBT:001211 mine drainage +OBT:001212 mine waste +OBT:003266 mine waste water +OBT:000350 mineral matter +OBT:000742 mineral oil +OBT:000743 mineral soil +OBT:002526 mineral water +OBT:001647 mining slag heap +OBT:002527 mink +OBT:002528 mint +OBT:002138 mint and related product +OBT:000351 mire +OBT:001648 mite +OBT:001649 mixed 
cereal-based snack +OBT:001650 mixed dish +OBT:003106 mixed salad +OBT:002139 mixed vegetable +OBT:000744 mixotroph +OBT:001213 model plant +OBT:001214 moderate halophile +OBT:002140 modified-atmosphere-packed food +OBT:003000 moldy peanut +OBT:001215 mollusc +OBT:001651 mollusc and product thereof +OBT:003107 monkey +OBT:003001 monocyte +OBT:000352 monument +OBT:001216 moor +OBT:003422 moose +OBT:003479 Morbier +OBT:003244 Morcela de Arroz +OBT:003267 morel +OBT:002141 morel and related product +OBT:002142 mosquito +OBT:000745 moss +OBT:001652 moth +OBT:003491 mother +OBT:000117 motile +OBT:003492 mould ripened cheese +OBT:000746 mound +OBT:000747 mountain +OBT:002727 mouse +OBT:002143 moussaka +OBT:001653 mouth +OBT:001217 mouth part +OBT:002820 moxifloxacin resistant +OBT:002821 moxifloxacin sensitive +OBT:002822 moxifloxacin tolerant +OBT:003452 mozzarella +OBT:002144 mucocutaneous surface +OBT:001654 mucosal surface +OBT:001218 mucosal tissue +OBT:001219 mucous membrane +OBT:001220 mucus +OBT:001221 mud +OBT:001655 mud sediment +OBT:001222 mud volcano +OBT:000748 muddy water +OBT:001656 mudflat +OBT:001223 mudpit +OBT:001657 muesli +OBT:002145 mulberry tree +OBT:002529 mullet meat +OBT:000353 multicellular arrangement phenotype +OBT:002146 mummy +OBT:001658 mummy tissue +OBT:003296 mung bean plant +OBT:002917 municipal sewage plant +OBT:003002 municipal sewage sludge digester +OBT:001224 municipal sludge +OBT:001225 municipal solid waste +OBT:003218 munkoyo +OBT:003512 Munster +OBT:000749 mural painting +OBT:002530 murine +OBT:001226 muscle +OBT:000750 musculoskeletal system +OBT:000751 musculoskeletal system part +OBT:000752 mushroom +OBT:001659 mushroom based dish +OBT:002531 mushroom bed +OBT:000354 mushroom farm +OBT:002728 muskmelon +OBT:002532 muskrat +OBT:002147 mussel and product thereof +OBT:000753 mussel farm +OBT:001660 mustard +OBT:002148 mustelidae +OBT:000118 mutant +OBT:002149 mutton meat +OBT:002823 naladixic acid resistant +OBT:003003 
naphthalene contaminated sediment +OBT:000754 nare +OBT:000755 nasal cavity +OBT:000756 nasal epithelia +OBT:001227 nasal passage abscess +OBT:001228 nasal secretion +OBT:001661 nasopharyngeal mucosa +OBT:000757 nasopharynx +OBT:000758 natron-alkaliphilic +OBT:003388 natto +OBT:000013 natural environment habitat +OBT:000759 natural gas +OBT:003361 natural gas-enriched soil +OBT:000355 naval surface ship +OBT:001229 neck +OBT:000760 necropolis +OBT:000761 necrotic lesion +OBT:000762 necrotrophic +OBT:002150 nectarine and primary derivative thereof +OBT:002533 nectarine as food +OBT:000356 needle +OBT:000763 negative aerotactic +OBT:000764 negative chemotactic +OBT:000174 Neisser stain phenotype +OBT:000472 Neisser-negative +OBT:000473 Neisser-positive +OBT:001230 nematode +OBT:000765 nerve +OBT:000766 nervous system +OBT:000357 nervous system part +OBT:000119 nest +OBT:003513 Neufchatel +OBT:000358 neuston +OBT:003063 neutral hotspring +OBT:000359 neutralophile +OBT:003004 neutrophil +OBT:001662 newborn animal +OBT:003453 newborn infant +OBT:003005 nitrobenzene contaminated sediment +OBT:001663 nitrogen fertilizer factory +OBT:002534 nitrogen-poor soil +OBT:000120 non motile +OBT:001664 non-biofilm forming +OBT:000121 non-immune serum +OBT:000122 non-sporulating +OBT:002824 norfloxacin resistant +OBT:002825 norfloxacin sensitive +OBT:002826 norfloxacin tolerant +OBT:000767 nose +OBT:002151 nougat +OBT:000768 noxious plant +OBT:002918 nozzle +OBT:003080 Ntoba Mbodi +OBT:002152 nucellus +OBT:002729 nugget +OBT:001231 nursing home +OBT:003297 nursing home resident +OBT:001665 nut +OBT:001232 nut and primary derivative thereof +OBT:003064 nut based drink +OBT:002153 nutmeg +OBT:000360 nutrient broth +OBT:002535 nutrient-poor soil +OBT:001666 oak +OBT:002154 oat and primary derivative thereof +OBT:002536 oat as food +OBT:003423 oat beverage +OBT:003454 oat milk +OBT:002919 oat plant +OBT:000769 obligate aerobe +OBT:000770 obligate anaerobe +OBT:001233 obligate halophile 
+OBT:001667 obligate methanotroph +OBT:000771 obligate parasite +OBT:002827 obligate phytopathogen +OBT:000772 obligate piezophile +OBT:002730 ocean trench +OBT:002155 octopus and product thereof +OBT:001668 oesophagus +OBT:001234 offal and product thereof +OBT:000773 office +OBT:001669 offshore oil industry +OBT:001670 offspring +OBT:003006 oil contaminated sediment +OBT:003389 oil contaminated soil +OBT:001235 oil field +OBT:002828 oil field water +OBT:001236 oil industry +OBT:003007 oil mill wastewater +OBT:001671 oil pipeline +OBT:001672 oil reservoir +OBT:001673 oil seep +OBT:002156 oil sludge +OBT:001674 oil spill +OBT:000774 oil tanker +OBT:001237 oil well +OBT:001238 oilfruit and primary derivative thereof +OBT:001675 oil-water separator +OBT:001676 okpehe +OBT:002537 okra +OBT:002157 okra and related product +OBT:003108 oleaginous seed based drink +OBT:002158 oleander +OBT:000361 oligotrophic +OBT:002538 oligotrophic water +OBT:003008 olive +OBT:003109 olive and primary derivative thereof +OBT:003158 olive as food +OBT:003536 olive oil +OBT:002159 olive tree +OBT:001677 omasum +OBT:002539 onion +OBT:002160 onion and related product +OBT:003159 onion plant +OBT:000362 oomycete +OBT:001678 open pit mine +OBT:003268 open skin wound +OBT:000775 open-ocean +OBT:000363 operating room +OBT:002161 opossum +OBT:000776 opportunistic pathogen +OBT:001239 oral thermometer +OBT:003009 orange +OBT:002162 orange and primary derivative thereof +OBT:002540 orange as food +OBT:003585 orange juice +OBT:001240 orange pigmented +OBT:001679 orange storehouse +OBT:002163 orange tree +OBT:002541 oregano +OBT:002164 oregano and related product +OBT:000364 organ +OBT:003345 organic compound contaminated soil +OBT:001680 organic leachate +OBT:000365 organic matter +OBT:000777 organic waste +OBT:000778 organotroph +OBT:001681 ornemental tree +OBT:002327 Ornithodoros moubata +OBT:002328 Ornithodoros turicatae +OBT:000779 ornithogenic soil +OBT:000366 oropharynx +OBT:001392 Osmitopsis 
asteriscoides +OBT:000367 osmophile +OBT:000368 osmophobe +OBT:000369 osmotactic +OBT:000370 osmotolerant +OBT:000780 osteolytic bone lesion +OBT:002731 ostrich meat +OBT:003110 otter +OBT:000781 outdoor air +OBT:000782 outer ear +OBT:000783 ovary +OBT:001682 ovary wall +OBT:001241 oven +OBT:001683 ovule +OBT:001684 ovule part +OBT:002920 oxacillin resistant +OBT:000371 oxidase activity +OBT:000784 oxidase negative +OBT:000785 oxidase positive +OBT:000123 oxidoreductase activity +OBT:002165 oyster and product thereof +OBT:000786 pacemaker +OBT:000787 packaging +OBT:001685 packaging clean room +OBT:001686 packaging factory +OBT:001687 packed food +OBT:000788 packed lunch +OBT:002542 paddle fish meat +OBT:002166 paella +OBT:003372 PAH contaminated soil +OBT:000789 paired +OBT:001688 paisa mire +OBT:001689 palagonite rind +OBT:001690 palisade layer +OBT:002543 palm heart +OBT:002167 palm heart and related product +OBT:003065 palm oil +OBT:000790 pancreas +OBT:000791 pancreatic duct +OBT:001242 panda +OBT:003219 pantothenate enriched soil +OBT:001243 panty liner +OBT:003010 papaya +OBT:000792 paper +OBT:000793 paper carton +OBT:001244 paper gown +OBT:002168 paper manufacture +OBT:001691 paper mill +OBT:001692 paper mill sludge +OBT:000794 paper pulp +OBT:001245 paper towel +OBT:002544 parakeet +OBT:000372 parasite +OBT:001693 parasitic nematode +OBT:001246 parenchyma +OBT:003514 Parmesan +OBT:002169 parrot +OBT:002545 parsley +OBT:002170 parsley and related product +OBT:002546 parsnip +OBT:002171 parsnip and related product +OBT:000373 part of food +OBT:000014 part of living organism +OBT:003066 partridge +OBT:000795 passenger train +OBT:002172 passerine bird +OBT:001694 pasta and related product +OBT:001695 pasta based dish +OBT:002173 pasteurized food +OBT:002174 pastry product +OBT:000374 pasture +OBT:000375 pathogen +OBT:003220 patient +OBT:003269 patient with infectious disease +OBT:003348 PCB contaminated soil +OBT:000975 PCE contaminated site +OBT:002175 pea 
+OBT:001696 pea and related product +OBT:001247 pea family +OBT:003298 pea plant +OBT:003011 peach +OBT:002176 peach and primary derivative thereof +OBT:002547 peach as food +OBT:001697 peanut and primary derivative thereof +OBT:002177 peanut as food +OBT:003362 peanut beverage +OBT:002178 peanut butter +OBT:003390 peanut milk +OBT:003012 pear +OBT:002179 pear and primary derivative thereof +OBT:002548 pear as food +OBT:002549 pear tomato +OBT:002180 pear tree +OBT:003111 peat +OBT:001698 peat cut +OBT:001699 peat swamp forest +OBT:001700 peatland +OBT:002181 pecan +OBT:001701 pecan and primary derivative thereof +OBT:002829 penicillin resistant +OBT:002830 penicillin sensitive +OBT:002831 penicillin tolerant +OBT:002182 pepper +OBT:001248 peptic ulcer +OBT:002550 perch meat +OBT:000796 perchlorate-contaminated site +OBT:001702 pericarp +OBT:002921 pericycle +OBT:001249 perineal abscess +OBT:000376 perineum +OBT:001250 periodontal abscess +OBT:000797 periodontal lesion +OBT:001703 periodontal pocket +OBT:001704 periodontium +OBT:001251 peripheral nervous system +OBT:000798 periphytic +OBT:000124 periphyton +OBT:001252 perirectal area +OBT:000377 peritoneal cavity +OBT:000799 peritoneal fluid +OBT:001253 peritoneum +OBT:003221 peritrophic membrane +OBT:002551 permafrost +OBT:001705 permafrost sediment +OBT:000800 pest insect +OBT:003160 pesticide enriched soil +OBT:001706 pesto +OBT:001254 pet +OBT:003112 pet bird +OBT:003222 pet rabbit +OBT:003113 pet turtle +OBT:001707 petal +OBT:001708 petiole +OBT:003067 petite positive +OBT:003564 petit-suisse +OBT:001709 petrochemical factory +OBT:001710 petroleum refinery +OBT:001711 petroleum reservoir +OBT:000801 phagocyte +OBT:002183 phagocytosis resistant +OBT:000802 pharyngeal mucosa +OBT:002832 pharynx +OBT:003068 pheasant +OBT:002552 pheasant meat +OBT:003424 phenanthrene contaminated soil +OBT:000803 phenol +OBT:000015 phenotype wrt adhesion +OBT:000378 phenotype wrt antimicrobial impact +OBT:000379 phenotype wrt 
carbon source +OBT:001712 phenotype wrt chemical composition +OBT:000380 phenotype wrt color +OBT:000016 phenotype wrt community behaviour +OBT:000125 phenotype wrt disease impact +OBT:000381 phenotype wrt drug impact +OBT:000126 phenotype wrt energy source +OBT:000017 phenotype wrt environment +OBT:000018 phenotype wrt growth +OBT:000127 phenotype wrt habitat acidity +OBT:000128 phenotype wrt habitat osmolarity +OBT:000129 phenotype wrt habitat oxygen +OBT:000382 phenotype wrt habitat salinity +OBT:000130 phenotype wrt habitat temperature +OBT:000131 phenotype wrt immune system impact +OBT:000383 phenotype wrt living energy source +OBT:000019 phenotype wrt metabolic activity +OBT:000384 phenotype wrt metal concentration impact +OBT:000132 phenotype wrt microbial-host interaction +OBT:000133 phenotype wrt molecule impact +OBT:000020 phenotype wrt morphology +OBT:000021 phenotype wrt motility +OBT:000134 phenotype wrt nutrient abundance +OBT:000022 phenotype wrt ploidy +OBT:000135 phenotype wrt pressure +OBT:000136 phenotype wrt radiation impact +OBT:000385 phenotype wrt reducing equivalent source +OBT:000137 phenotype wrt relative humidity impact +OBT:000386 phenotype wrt shape +OBT:000023 phenotype wrt stress +OBT:000138 phenotype wrt temperature impact +OBT:000024 phenotypre wrt genetic +OBT:001713 phloem +OBT:002553 photic zone +OBT:002922 photoautotroph +OBT:002923 photoheterotroph +OBT:002924 photolithoautotroph +OBT:003299 photoorganoheterotroph +OBT:000804 photosynthetic +OBT:000387 phototactic +OBT:000805 phototroph +OBT:000806 phylloplane +OBT:000807 phylloplane part +OBT:000388 phyllosphere +OBT:000389 phyllosphere part +OBT:000025 physiological phenotype +OBT:002732 phytopathogen +OBT:001714 phytoplankton +OBT:002184 pickled food +OBT:002554 pickles +OBT:002555 pico de gallo +OBT:002185 pie +OBT:000390 piezophile +OBT:000391 piezosensitive +OBT:000392 piezotolerant +OBT:003337 pig +OBT:000808 pig farm +OBT:003161 pig manure +OBT:002186 pigeon +OBT:002557 
pigeon meat +OBT:000809 pigmented +OBT:002556 pig's trotter +OBT:000810 pillow as equipment +OBT:002187 pine +OBT:001715 pine forest +OBT:001716 pine forest humus +OBT:002925 pine litter +OBT:002188 pineapple and primary derivative thereof +OBT:002558 pineapple as food +OBT:003586 pineapple juice +OBT:001255 pink pigmented +OBT:001256 piping system +OBT:002189 pistachio +OBT:001717 pistachio and primary derivative thereof +OBT:003363 pistachio beverage +OBT:003391 pistachio milk +OBT:003114 pizza +OBT:000811 placenta +OBT:000026 planet +OBT:001257 plankton +OBT:000139 planktonic +OBT:000393 plant +OBT:003013 plant based drink +OBT:003530 plant based juice +OBT:002733 plant commensal +OBT:002190 plant cutting +OBT:002833 plant facultative symbiont +OBT:000140 plant habitat +OBT:000394 plant hosted +OBT:002834 plant litter +OBT:001258 plant material +OBT:000812 plant nodule +OBT:002835 plant obligate symbiont +OBT:000813 plant opportunist +OBT:000814 plant organ +OBT:000395 plant part +OBT:000815 plant product and primary derivative thereof +OBT:001718 plant residue +OBT:002734 plant symbiont +OBT:000816 plant tissue +OBT:001259 plantlet +OBT:000817 pleomorphic +OBT:003014 pleural empyema +OBT:000818 pleural fluid +OBT:003015 plum +OBT:002191 plum and primary derivative thereof +OBT:003223 plum as food +OBT:002559 plum tomato +OBT:002192 plum tree +OBT:002836 plumule +OBT:000819 pneumonic lesion +OBT:002193 poached food +OBT:001719 pocosin +OBT:001720 polar sea ice +OBT:002194 pollen +OBT:001721 pollen tube +OBT:000141 polluted environment +OBT:003323 polluted seawater +OBT:001722 pome fruit and primary derivative thereof +OBT:001260 pond +OBT:001261 pond water +OBT:003515 Pont l'Évêque +OBT:002195 poppy seed +OBT:001723 poppy seed and primary derivative thereof +OBT:000396 pore forming +OBT:000820 porifera +OBT:002196 pork +OBT:002560 pork chop +OBT:003270 portobello mushroom +OBT:000821 positive aerotactic +OBT:000822 positive chemotactic +OBT:001262 posterior 
intestinal content +OBT:001724 potato +OBT:002197 potato and related product +OBT:002561 potato as food +OBT:001725 potato based dish +OBT:001726 potato chip +OBT:001263 potato silage +OBT:000823 potluck +OBT:003300 poultry +OBT:002562 poultry deep litter +OBT:000824 poultry farm +OBT:001727 poultry house +OBT:002198 poultry litter +OBT:002199 poultry meat +OBT:001728 powdered food +OBT:001264 power plant +OBT:003595 pozol +OBT:003069 pozole +OBT:002563 prawn +OBT:002200 prawn and product thereof +OBT:003493 pregnant woman +OBT:000397 prepared food +OBT:001265 prepared meat +OBT:000825 prepuce +OBT:000142 presence of nanotube +OBT:000143 presence of quorum sensing +OBT:001266 preserved food +OBT:001267 pressed food +OBT:001729 pressure treated food +OBT:002201 primate +OBT:001268 primate part +OBT:000826 probe +OBT:000827 probiotic +OBT:002837 probiotic feed +OBT:001269 probiotic food +OBT:002202 processed cheese +OBT:000828 processed commodity and food +OBT:002838 produced water +OBT:002926 produced water from an oil well +OBT:002927 produced water of an oil reservoir +OBT:001730 progeny +OBT:003016 propolis +OBT:000829 prostate +OBT:000830 prosthetic joint +OBT:003324 prostitute +OBT:000398 prothesis +OBT:000399 prototroph +OBT:000400 protozoa +OBT:000144 pseudohyphae growth +OBT:000401 psychrophile +OBT:000402 psychrotrophic +OBT:002203 public bathing facility +OBT:000403 public building +OBT:000145 public equipment +OBT:001270 public house +OBT:002564 public sauna +OBT:000404 public toilet +OBT:001271 public toilet seat +OBT:000405 public transport +OBT:002204 pudding +OBT:002565 puff pastry +OBT:002928 pulp-bleaching waste water +OBT:002566 pumpkin +OBT:002205 pumpkin and related product +OBT:002206 pumpkin seed +OBT:001731 pumpkin seed and primary derivative thereof +OBT:001732 pupa +OBT:003224 puparia +OBT:001272 purple pigmented +OBT:000831 pus +OBT:002839 pyrazinamide resistant +OBT:003017 pyritic acid mine drainage +OBT:003325 quail +OBT:002207 quail egg 
+OBT:002567 quail meat +OBT:002568 quark +OBT:001273 quarry +OBT:002569 queso blanco +OBT:002570 queso fresco +OBT:001733 quiche +OBT:003392 quinate enriched soil +OBT:002208 quince and primary derivative thereof +OBT:002571 quince as food +OBT:002209 quinoa +OBT:001734 quinoa and primary derivative thereof +OBT:003364 quinoa beverage +OBT:000832 quinoa feed +OBT:003393 quinoa milk +OBT:002840 quinolone resistant +OBT:002572 rabbit +OBT:002210 rabbit meat +OBT:002211 rabbit tick +OBT:003480 Raclette +OBT:002212 radiation resistant +OBT:002213 radiation sensitive +OBT:002214 radiation tolerant +OBT:002841 radicle +OBT:002929 radioactive sediment +OBT:002573 radish +OBT:002215 radish and related product +OBT:002216 radish as food +OBT:002217 radish plant +OBT:002735 rainwater +OBT:001274 rainwater tank +OBT:003271 rainwater treatment utility +OBT:001735 raised mire +OBT:000833 rash +OBT:003018 raspberry +OBT:002218 raspberry and primary derivative thereof +OBT:002574 raspberry as food +OBT:002736 rat +OBT:002575 rat flea +OBT:002219 ratatouille +OBT:002576 ratite meat +OBT:001736 raw dough +OBT:001275 raw meat +OBT:001276 raw milk +OBT:000834 raw primary commodities +OBT:001277 raw seafood +OBT:002577 ray meat +OBT:000835 ready made meal +OBT:001278 ready-to-eat meal +OBT:003516 Reblochon +OBT:000406 recreational fishing fish pond +OBT:001737 rectal swab +OBT:001279 rectal thermometer +OBT:001738 rectum +OBT:001739 red clay +OBT:003425 red deer +OBT:002737 red deer meat +OBT:001280 red pigmented +OBT:002220 redcurrant and primary derivative thereof +OBT:002578 redcurrant as food +OBT:001740 red-pigmented bacteriome +OBT:001741 reef +OBT:000836 reef surface biofilm +OBT:001281 refinery +OBT:000837 refrigerator +OBT:002221 reheated food +OBT:003426 reindeer +OBT:002842 rennet +OBT:001282 reptile +OBT:001742 reptile and product thereof +OBT:000838 research and study center +OBT:003338 researcher +OBT:000839 residential carpet +OBT:000840 residential toilet +OBT:002843 
resistance to amikacin +OBT:002844 resistance to cotrimoxazole +OBT:002845 resistance to dalfopristin +OBT:002846 resistance to imipenem +OBT:002847 resistance to levofloxacin +OBT:002848 resistance to quinupristin +OBT:000841 respiratory therapy equipment +OBT:000407 respiratory tract +OBT:000408 respiratory tract part +OBT:001283 restaurant +OBT:001743 reticulum +OBT:000409 rheotactic +OBT:003019 rhizoplane +OBT:000410 rhizosphere +OBT:000411 rhizosphere part +OBT:002579 rhubarb +OBT:002222 rhubarb and related product +OBT:002580 rice +OBT:002223 rice and primary derivative thereof +OBT:001744 rice based dish +OBT:003427 rice beverage +OBT:001745 rice chip +OBT:000842 rice feed +OBT:003455 rice milk +OBT:002581 rice paddy +OBT:002930 rice plant +OBT:002224 rice pudding +OBT:001284 rice silage +OBT:002582 rice straw +OBT:003568 rice vinegar +OBT:002583 rice waste +OBT:002225 rice-plant residue +OBT:002584 rice-straw residue +OBT:003456 ricotta +OBT:002849 rifampin resistant +OBT:002226 right arm +OBT:003428 ripened cheese +OBT:002227 ripened sausage +OBT:003301 ripening room +OBT:001285 river +OBT:002585 river sediment +OBT:002931 river water +OBT:002586 roach meat +OBT:000843 road +OBT:000844 road junction +OBT:000412 road part +OBT:000845 road side +OBT:003162 roadside soil +OBT:001746 roasted coffee bean +OBT:002228 roasted food +OBT:000846 rock +OBT:000847 rock scraping +OBT:002587 rocket +OBT:002229 rocket and related product +OBT:002230 rodent +OBT:000413 rodent nest +OBT:000848 rod-shaped +OBT:002738 roe deer meat +OBT:003527 romadur +OBT:003517 Romano +OBT:000414 room +OBT:000415 room floor +OBT:002850 root +OBT:001747 root and tuber vegetable +OBT:001286 root cap +OBT:003020 root cortex +OBT:001287 root cortex part +OBT:003070 root endodermis +OBT:000000 root for extraction +OBT:003272 root hair +OBT:002932 root nodule +OBT:000849 root part +OBT:001288 ropy colony +OBT:003518 Roquefort +OBT:002588 rosemary +OBT:002231 rosemary and related product 
+OBT:001748 rotting hay +OBT:000850 round-shaped +OBT:003071 royal jelly +OBT:002232 rum +OBT:001749 rumen +OBT:002589 ruminant +OBT:001289 ruminant digestive system part +OBT:000146 rural area +OBT:002590 rye +OBT:002233 rye and primary derivative thereof +OBT:000851 rye feed +OBT:001750 rye grass silage +OBT:000474 Saccharomyces cerevisiae starter +OBT:002591 sage +OBT:002234 sage and related product +OBT:003519 Saint-Nectaire +OBT:003520 Saint-Paulin +OBT:002235 sake +OBT:002739 sake brewery +OBT:003563 salami +OBT:001393 Salicornioideae +OBT:003225 salicylate enriched soil +OBT:002236 saline brine sediment +OBT:003226 saline lake +OBT:003072 saline marsh +OBT:001751 saline sediment +OBT:002851 saline wastewater +OBT:002592 saline water +OBT:001290 saline wetland +OBT:001291 saliva +OBT:001752 salivary gland +OBT:000416 salivary sediment +OBT:003021 salmon +OBT:003394 salmon egg +OBT:002593 salmon meat +OBT:002933 salmonides +OBT:002594 salsify +OBT:002237 salsify and related product +OBT:001753 salt +OBT:003115 salt contaminated soil +OBT:000852 salt crust +OBT:001754 salt lake mud +OBT:002595 salt pork +OBT:003163 salt stressed soil +OBT:002238 salted food +OBT:002740 salted lake sediment +OBT:003164 saltern +OBT:003227 saltern crystallizer pond +OBT:001755 salt-preserved food +OBT:001292 salt-tolerant plant +OBT:001293 sand +OBT:002852 sand aquifer +OBT:000853 sandstone +OBT:000854 sandstone monument +OBT:001756 sandwich +OBT:003228 sandwich bread +OBT:001294 sandy beach +OBT:001295 sandy bulk soil +OBT:001757 sandy sediment +OBT:000855 sandy soil +OBT:001296 sanitary towel +OBT:001297 sap +OBT:003326 saprophytic +OBT:003521 Sapsago +OBT:003302 sardine +OBT:002596 sardine meat +OBT:002239 sashimi +OBT:003022 sauce +OBT:003165 sauerkraut +OBT:000856 sauna +OBT:001758 sausage +OBT:000857 savannah +OBT:001759 saw mill +OBT:001298 sawmill +OBT:002853 scald +OBT:002240 scalded food +OBT:003457 schabziger +OBT:000858 school +OBT:000859 school bus +OBT:003327 
scientist +OBT:003522 Scimudin +OBT:001760 scleractinian coral +OBT:001299 sclerenchyma +OBT:000860 scratch +OBT:000861 scrub as clothing +OBT:002241 sea cucumber and product thereof +OBT:001300 sea ice +OBT:002242 sea pineapple and product thereof +OBT:002243 sea salt +OBT:001761 sea sand +OBT:003273 sea urchin roe +OBT:000862 seabed +OBT:001301 seafood and seafood product +OBT:001762 seafood based dish +OBT:001763 seagrass +OBT:003073 seal +OBT:002934 seaweed +OBT:001302 sebum +OBT:000863 secretion +OBT:001303 sediment +OBT:002935 sediment contaminated by organic pollutants +OBT:001764 sedimentation pond +OBT:001765 seed +OBT:002244 seed eating bird +OBT:001766 seed part +OBT:001304 seedling +OBT:001305 self-heated organic material +OBT:001306 semen +OBT:003458 semi soft cheese +OBT:000864 sensor +OBT:001767 sepal +OBT:003116 septic tank +OBT:003395 sesame beverage +OBT:003429 sesame milk +OBT:002245 sesame oil +OBT:002246 sesame seed +OBT:001768 sesame seed and primary derivative thereof +OBT:000417 sessile +OBT:001307 sewage +OBT:002936 sewage disposal plant +OBT:001308 sewage oxidation pond +OBT:002854 sewage pipe +OBT:002855 sewage plant +OBT:001309 sewage sludge +OBT:003396 sewer +OBT:003365 sewerage system +OBT:000147 sexual mating +OBT:000148 sexual reproduction +OBT:002597 shad meat +OBT:001310 shale sandstone +OBT:002598 shallot +OBT:002247 shallot and related product +OBT:002937 shallow coastal aquifer +OBT:001769 shallow pond +OBT:001311 shampoo +OBT:002599 shark meat +OBT:002248 sharpshooter +OBT:001312 shaving cream +OBT:002938 shea cake digester +OBT:000865 sheathed +OBT:003531 sheep +OBT:000418 ship +OBT:000866 ship ballast +OBT:000867 ship ballast water +OBT:000419 ship hull +OBT:001313 ship scrapping waste +OBT:000868 ship tank +OBT:000420 shoe +OBT:001770 shoe factory +OBT:002741 shoot apex +OBT:001314 shore +OBT:001315 shoreline +OBT:000869 shower +OBT:000421 shower aerosol +OBT:002856 shower curtain +OBT:003229 shower head +OBT:000422 shrimp 
culture pond +OBT:000870 shrimp hatchery +OBT:001771 shrub +OBT:001772 sigmoid colon +OBT:000871 silage +OBT:001316 silcone rubber voice prothesis +OBT:002600 siliceous ooze +OBT:000423 silo +OBT:001317 silt +OBT:001318 silver pigmented +OBT:002249 simmered food +OBT:000872 singled +OBT:000873 sink drain +OBT:000874 site contaminated with organic compound +OBT:001319 skeleton +OBT:000875 skin +OBT:003230 skin abscess +OBT:000876 skin bump +OBT:002742 skin lesion +OBT:000877 skin nodule +OBT:000878 skin papule +OBT:000424 skin part +OBT:001320 skin ulcer +OBT:003231 skin wound +OBT:002250 slash pine forest +OBT:002251 slaughter plant +OBT:003328 slaughterer +OBT:001321 slaughtering product +OBT:002252 slaughtering waste +OBT:001322 slaugterhouse equipment +OBT:000425 slough +OBT:001773 sludge +OBT:000426 sludge blanket reactor +OBT:002939 sludge digester +OBT:000879 small colony +OBT:003303 small intestine +OBT:002743 small ruminant +OBT:003494 smear ripened cheese +OBT:002601 smelt meat +OBT:001774 smoked food +OBT:000880 smooth colony +OBT:001775 smooth cord grass +OBT:001323 snack +OBT:001776 snail +OBT:002253 snail product +OBT:001777 snake +OBT:002254 snake product +OBT:000881 snow +OBT:001324 soap +OBT:002255 soap scum +OBT:001778 soda +OBT:002940 soda lake +OBT:002744 soda lake sediment +OBT:000882 sofa as furniture +OBT:003459 soft cheese +OBT:001779 soft drink +OBT:002256 soft tick +OBT:000883 soft tissue +OBT:001325 soft tissue abscess +OBT:000427 soil +OBT:003117 soil contaminated with agricultural activity +OBT:003118 soil contaminated with industrial xenobiotic compound +OBT:003430 soil contaminated with used engine oil +OBT:000884 soil crust +OBT:000885 soil matter +OBT:003232 soil of roadside tree +OBT:000428 soil part +OBT:002257 soil with chemical property +OBT:002258 soil with physical property +OBT:001780 solar lake +OBT:003233 solar saltern +OBT:003339 soldier +OBT:002602 sole meat +OBT:000429 solfatara +OBT:001326 solfataric soil +OBT:001327 
solid agricultural waste +OBT:003166 sorbet +OBT:000886 sorghum feed +OBT:001328 soup +OBT:003023 sour anaerobic digester +OBT:003397 sour cream +OBT:003234 sour milk +OBT:003398 sourdough +OBT:002745 sourdough bread +OBT:003537 sourdough starter +OBT:003340 sow +OBT:003366 soy beverage +OBT:003399 soy milk +OBT:003570 soy sauce +OBT:002259 soybean +OBT:001781 soybean and related product +OBT:003304 soybean plant +OBT:002603 spa +OBT:001782 spacecraft assembly clean room +OBT:003523 Spalen +OBT:002857 spectinomycin-resistant +OBT:003431 spelt beverage +OBT:003460 spelt milk +OBT:002746 spelt product +OBT:001783 sphagnum bog +OBT:001784 spice +OBT:001785 spider +OBT:002604 spinach +OBT:002260 spinach and related product +OBT:000887 spinal cord +OBT:000888 spiral-shaped +OBT:001329 spirillum +OBT:002261 spirit +OBT:001330 spirochete +OBT:000889 spirometer +OBT:000890 spleen +OBT:001331 splenic abcess +OBT:000430 spoiled food +OBT:001332 sponge +OBT:002262 spoon +OBT:000149 spore +OBT:000150 sporulating +OBT:001333 spring +OBT:003346 spring high in sulfide +OBT:002605 spring sediment +OBT:001334 sprout +OBT:002263 spruce +OBT:001335 sputum +OBT:000431 sputum sediment +OBT:000891 square-shaped +OBT:003074 squid +OBT:002264 squid and product thereof +OBT:002606 squirrel +OBT:002265 stable manure +OBT:000892 stalked +OBT:001786 stall +OBT:001787 star coral +OBT:000893 star-shaped +OBT:000432 starter culture +OBT:000894 starter yeast +OBT:002266 steak tartare +OBT:002267 steamed food +OBT:002941 stem +OBT:003075 stem cortex +OBT:001788 stem cortex part +OBT:003119 stem endodermis +OBT:003076 stem epidermis +OBT:003024 stem nodule +OBT:001336 stem part +OBT:001789 stem vegetable +OBT:000433 sterile clean room +OBT:000151 sterile water +OBT:002268 sterilized food +OBT:001337 stew +OBT:002269 stewed food +OBT:001790 stigma +OBT:003524 Stilton +OBT:001791 stink bug +OBT:001792 stomach +OBT:000895 stomach content +OBT:001793 stomach mucosa +OBT:001794 stomach ulcer +OBT:001795 
stomata +OBT:001796 stone fruit and primary derivative thereof +OBT:000896 stool as furniture +OBT:002270 storage box +OBT:000434 storage equipment +OBT:003400 storm drain +OBT:002747 stormwater +OBT:001797 stratified lake +OBT:003167 stratified marine water column +OBT:001338 stratified water +OBT:000435 stratosphere +OBT:002271 straw +OBT:003025 strawberry +OBT:002272 strawberry and primary derivative thereof +OBT:002607 strawberry as food +OBT:002608 stream sediment +OBT:002942 stream water +OBT:003432 streched curd cheese +OBT:002858 streptomycin resistant +OBT:000152 stress resistant +OBT:000153 stress sensitive +OBT:000154 stress tolerant +OBT:002273 striploin +OBT:000155 stromatolite +OBT:000897 structured colony +OBT:003305 student +OBT:002609 sturgeon meat +OBT:001798 style +OBT:001339 subcutaneous abscess-like lesion +OBT:001799 subgingiva +OBT:000898 subgingival biofilm +OBT:001800 subgingival plaque +OBT:003168 submarine +OBT:001801 submarine basalt +OBT:002274 submarine glassy basalt +OBT:003554 submarine hotspring +OBT:003026 submarine hydrocarbon seep +OBT:003274 submarine thermal spring +OBT:003169 submarinegroundwater discharge +OBT:002859 submersed aquatic plant +OBT:000156 subterrestrial habitat +OBT:000436 subtropical area +OBT:000899 subway +OBT:001340 sugar +OBT:002860 sugar cane +OBT:002610 sugar cane field +OBT:003544 sugar cane juice +OBT:001802 sugar confectionery +OBT:002275 sugar factory +OBT:001803 sugar-beet +OBT:003170 sugar-beet refinery +OBT:001804 sugar-preserved food +OBT:002861 sulfamethoxazole resistant +OBT:002862 sulfate-rich wastewater +OBT:002863 sulfide mound +OBT:000900 sulfide-oxidizing bioreactor +OBT:000437 sulfide-rich environment +OBT:002611 sulfide-rich freshwater sediment +OBT:003367 sulfide-rich hot spring +OBT:003077 sulfide-rich water +OBT:003027 sulfide-saturated mud sediment +OBT:002612 sulfidic coastal sediment +OBT:000901 sulfidogenic bioreactor +OBT:003028 sulfur oxidizing +OBT:003401 sunflower beverage 
+OBT:003433 sunflower milk +OBT:003078 sunflower oil +OBT:002276 sunflower seed +OBT:001805 sunflower seed and primary derivatives thereof +OBT:001806 supragingival plaque +OBT:001341 surface of cheese +OBT:000902 surface of food +OBT:001807 surface sediment +OBT:001342 surface smear +OBT:000903 surface soil +OBT:000904 surface water +OBT:002613 surgery +OBT:000905 surgical cap +OBT:002614 surgical device +OBT:000438 surgical drain +OBT:001343 surgical gown +OBT:002277 surimi +OBT:002864 susceptibility to azithromycin +OBT:002865 susceptible to dalfopristin +OBT:002866 susceptible to quinupristin +OBT:003171 sushi +OBT:001344 swamp +OBT:000439 swarming +OBT:001345 sweat +OBT:002615 swede +OBT:002278 swede and related product +OBT:001808 sweet clover +OBT:002616 sweet corn +OBT:002279 sweet corn and related product +OBT:002280 sweet dough +OBT:002617 sweet pepper +OBT:002281 sweet pepper and related product +OBT:002618 sweet potato +OBT:002282 sweet potato and related product +OBT:000440 swimming +OBT:002619 swimming pool +OBT:003329 swine +OBT:001809 swine house +OBT:000441 symbiont +OBT:000442 symbiosome +OBT:000906 symbiotic diazotroph +OBT:000907 synovial fluid +OBT:000443 syntrophic +OBT:000444 syringe +OBT:000908 table as furniture +OBT:002620 table grape as food +OBT:003172 table olive +OBT:000909 tableware +OBT:001346 tail +OBT:001347 take-away restaurant +OBT:003525 Taleggio +OBT:001348 tampon +OBT:001349 tank water +OBT:001810 tannery +OBT:001811 tannery sludge +OBT:002943 tannery wastewater +OBT:000910 tap +OBT:002283 tap water +OBT:002621 tarragon +OBT:002284 tarragon and related product +OBT:000976 Tasmanian devil +OBT:000157 taxis phenotype +OBT:003368 tea infusion +OBT:001812 tea tree +OBT:001350 tears +OBT:001351 teat +OBT:001352 teat canal +OBT:002622 teff +OBT:002285 teff and primary derivative thereof +OBT:003434 teff beverage +OBT:003461 teff milk +OBT:002867 teicoplanin resistant +OBT:002868 teicoplanin sensitive +OBT:003601 tejuino +OBT:000445 
temperate zone +OBT:001353 temperature sensor +OBT:002286 tenderloin +OBT:000911 terminal airway +OBT:001813 terminal ileum +OBT:001814 termite +OBT:001354 terrestial wetland +OBT:001815 terrestrial crustacean +OBT:000158 terrestrial habitat +OBT:001816 terrestrial invertebrate product +OBT:000446 terrestrial landscape +OBT:000912 terrestrial plant +OBT:002287 testa +OBT:002288 testa part +OBT:000913 testis +OBT:002869 tetracycline resistant +OBT:000914 tetrad +OBT:000159 tetraploid +OBT:000447 textile +OBT:001817 textile dye effluent +OBT:001818 textile industry +OBT:002944 textile wastewater +OBT:002329 The Laughing Cow +OBT:002748 theatre shoe +OBT:000448 therapeutic equipment +OBT:000915 therapeutic ultrasound equipment +OBT:000160 thermal area +OBT:001819 thermal power plant +OBT:002945 thermal resistant +OBT:002289 thermal sensitive +OBT:001820 thermal spring +OBT:002290 thermal tolerant +OBT:000916 thermometer as medical device +OBT:000449 thermophile +OBT:002946 thermophilic aerobic digester +OBT:000917 thermophilic anaerobic methanogenic reactor +OBT:000450 thermophilic methanogenic bioreactor +OBT:000451 thermosphere +OBT:000452 thermotactic +OBT:000453 thermotolerant +OBT:000454 thigmotactic +OBT:000918 throat +OBT:000919 throat swab +OBT:002623 thyme +OBT:002291 thyme and related product +OBT:003538 thyme oil +OBT:001821 tick +OBT:001355 tidal creek +OBT:002749 tidal flat sediment +OBT:003029 tidal marsh +OBT:002292 tidal mudflat +OBT:003306 tilapia +OBT:002624 tilapia meat +OBT:003526 Tilsit +OBT:003079 toad +OBT:002870 tobacco plant +OBT:001822 tobacco warehouse +OBT:003235 tobiko +OBT:001823 toe +OBT:000920 toilet +OBT:000921 toilet bowl +OBT:000455 toilet equipment +OBT:000922 toilet seat +OBT:000923 toiletries +OBT:003481 Toma +OBT:003030 tomato +OBT:002293 tomato and related product +OBT:002625 tomato as food +OBT:003587 tomato juice +OBT:001824 tomato ketchup +OBT:002294 tomato plant +OBT:003307 tomato-marinated broiler meat strip +OBT:000924 
tomb +OBT:001825 tongue +OBT:001826 tooth +OBT:001356 toothbrush +OBT:001357 toothpaste +OBT:001827 toothpick +OBT:001828 tortoise +OBT:000925 toundra +OBT:000926 towel +OBT:001358 trachea +OBT:003435 traditional sourdough +OBT:000456 transconjugating +OBT:000161 transport and storage equipment +OBT:000162 transport equipment +OBT:001829 transverse colon +OBT:003482 Trappist +OBT:003308 traveler +OBT:000457 traveller luggage +OBT:001359 treated wood +OBT:001360 tree +OBT:000927 tree part +OBT:002871 trichome +OBT:000928 trichome forming +OBT:002872 trimethoprim resistant +OBT:000163 triploid +OBT:002626 triticale +OBT:002295 triticale and primary derivative thereof +OBT:000929 trona crust +OBT:000930 tropical country +OBT:002627 tropical soil +OBT:000458 tropical zone +OBT:000459 troposphere +OBT:003031 trout +OBT:002628 trout meat +OBT:003275 truffle +OBT:002296 truffle and related product +OBT:001361 trunk +OBT:000931 trypanosome +OBT:003173 tsetse fly +OBT:001362 tuber +OBT:000932 tuberculoid granuloma +OBT:002947 tubeworm +OBT:003309 tuna +OBT:002629 tuna meat +OBT:001830 tundra mire +OBT:001831 tunicate and product thereof +OBT:003310 turbot +OBT:002630 turbot meat +OBT:003330 turkey +OBT:002631 turkey meat +OBT:002632 turnip +OBT:002297 turnip and related product +OBT:003588 turnip juice +OBT:001832 turtle +OBT:002298 turtle product +OBT:001363 twig +OBT:000460 twitching +OBT:003436 type I sourdough +OBT:003437 type II sourdough +OBT:003438 type III sourdough +OBT:003347 tzatziki +OBT:000164 ubiquitous +OBT:001364 udder parenchyma +OBT:001855 UHT food +OBT:000933 ulcer +OBT:000934 ulcerative lesion +OBT:000461 ultrasound gel +OBT:003032 unamended soil +OBT:002873 underground mine +OBT:003236 unhopped wort +OBT:001365 university +OBT:000935 upland +OBT:001366 upper gastrointestinal tract part +OBT:000936 upper respiratory tract +OBT:003120 urban soil +OBT:002299 urchin and product thereof +OBT:003237 urea enriched soil +OBT:000165 urea solution +OBT:000937 
ureter +OBT:000938 urethra +OBT:000939 urinary catheter +OBT:000940 urinary tract +OBT:001367 urine +OBT:000462 urine sediment +OBT:000941 urogenital tract +OBT:000463 urogenital tract part +OBT:000942 uterus +OBT:002330 UV radiation resistant +OBT:002331 UV radiation sensitive +OBT:002332 UV radiation tolerant +OBT:002659 UVA radiation resistant +OBT:002660 UVA radiation sensitive +OBT:002661 UVA radiation tolerant +OBT:002662 UVB radiation resistant +OBT:002663 UVB radiation sensitive +OBT:002664 UVB radiation tolerant +OBT:002665 UVC radiation resistant +OBT:002666 UVC radiation sensitive +OBT:002667 UVC radiation tolerant +OBT:000464 vaccine +OBT:002300 vacuum-packed food +OBT:000943 vagina +OBT:001368 vaginal abscess +OBT:001369 vaginal secretion +OBT:001833 vaginal swab +OBT:000944 valley +OBT:003483 Valtellina Casera +OBT:002874 vancomycin resistant +OBT:002875 vancomycin sensitive +OBT:002876 vancomycin tolerant +OBT:000945 vascular +OBT:001370 vascular tissue +OBT:001834 vegetable based dish +OBT:003539 vegetable based juice +OBT:001371 vegetable garden soil +OBT:003369 vegetable puree +OBT:002750 vegetarian +OBT:001835 vein +OBT:002633 velveeta +OBT:002634 venison meat +OBT:000946 vertebrate +OBT:000465 vertebrate part +OBT:001836 vestibular mucosa +OBT:003331 veterinarian +OBT:000947 veterinary drug +OBT:002333 Vienna bread +OBT:002301 viennoiserie +OBT:003238 viili +OBT:003565 vinegar +OBT:002751 vinegar factory +OBT:001372 violet pigmented +OBT:001837 vitamin supplement +OBT:000948 vitreous fluid +OBT:000949 vitreous humor +OBT:000166 volcanic area +OBT:000950 volcanic soil +OBT:000951 volcano +OBT:002635 vole +OBT:000952 vomit +OBT:002636 waffle mixture +OBT:000466 wall +OBT:002302 walnut +OBT:001838 walnut and primary derivative thereof +OBT:003370 walnut beverage +OBT:003402 walnut milk +OBT:002303 walnut oil +OBT:001839 war readiness warehouse +OBT:001373 warehouse +OBT:003462 warm coastal water +OBT:003439 warm seawater +OBT:001374 warm-blooded 
animal +OBT:002637 wasabi +OBT:002304 wasabi and related product +OBT:000953 washbasin +OBT:000954 washing machine +OBT:001840 wasp +OBT:000467 waste +OBT:002877 waste container +OBT:001375 waste food compost +OBT:000167 waste treatment environment +OBT:002752 waste treatment equipment +OBT:002753 waste treatment plant +OBT:002754 waste water +OBT:003545 waste water pipe +OBT:002948 wastewater treatment digester +OBT:003174 wastewater treatment equipment +OBT:003239 wastewater treatment plant +OBT:000468 water +OBT:001841 water based beverage +OBT:001842 water based dish +OBT:001376 water canal +OBT:001377 water column +OBT:000955 water cooling system +OBT:001843 water dispenser +OBT:000956 water droplet +OBT:000957 water from air and water system +OBT:001378 water heater +OBT:000958 water heater system +OBT:001844 water in cooling tower +OBT:001379 water of an humidifier +OBT:003033 water pipe +OBT:003276 water pollution treatment plant +OBT:000959 water storage system +OBT:001380 water system +OBT:001381 water tank +OBT:002949 water tap +OBT:000960 water transport structure +OBT:000961 water treatment plant +OBT:000962 water vapor +OBT:000469 water well +OBT:002639 watercress +OBT:002305 watercress and related product +OBT:002640 waterfowl +OBT:002306 watermelon and related product +OBT:002641 watermelon as food +OBT:002307 watermelon rind +OBT:002638 water-stressed soil +OBT:002878 water-table aquifer +OBT:002308 weatherfish meat +OBT:001845 weed +OBT:003332 welder +OBT:000963 welfare center +OBT:000168 wet environment +OBT:000964 wetland +OBT:002309 wheat and primary derivative thereof +OBT:000965 wheat feed +OBT:002642 wheat field +OBT:002950 wheat plant +OBT:002643 wheat product +OBT:002644 wheat semolina +OBT:003589 wheatgrass juice +OBT:001846 whey +OBT:003440 whey cheese +OBT:000966 whirlpool bath +OBT:002755 whirlpool spa +OBT:002645 white mustard +OBT:002310 white mustard and related product +OBT:002646 white pepper +OBT:001382 white pigmented 
+OBT:003441 white tail deer +OBT:002647 whitefish meat +OBT:001847 whole egg +OBT:000967 wild animal +OBT:002648 wild boar meat +OBT:002311 wild fish meat +OBT:002649 wild rabbit meat +OBT:000169 wild-type +OBT:003546 wine +OBT:002650 wine grape as food +OBT:000968 wine yeast +OBT:002651 winter savory +OBT:002312 winter savory and related product +OBT:002652 witloof +OBT:002313 witloof and related product +OBT:002653 wolffish meat +OBT:003463 woman +OBT:002654 wombat +OBT:001383 wood +OBT:002314 wood tick +OBT:000969 woody landscape +OBT:003311 worker +OBT:001384 working animal +OBT:003528 working horse +OBT:001385 worm +OBT:003121 wort +OBT:000970 wound +OBT:002655 wuchang bream meat +OBT:003240 xueo +OBT:001848 xylem +OBT:003403 xylene contaminated soil +OBT:002315 yak +OBT:002316 yak milk +OBT:002656 yam +OBT:002317 yam and related product +OBT:002657 yeast +OBT:002756 yeast bread +OBT:003464 yeast bread dough +OBT:000170 yeast extract +OBT:001386 yellow pigmented +OBT:003122 yoghurt from bambara groundnut +OBT:003404 yoghurt from fermented soybean milk +OBT:003241 yogurt +OBT:001387 yolk sac +OBT:003175 young adult +OBT:001849 young animal +OBT:002318 zebra +OBT:002951 zinc- and sulfate-rich wastewater +OBT:001850 zinc factory +OBT:000171 zoo +OBT:000971 zoo animal +OBT:001851 zooplankton diff --git a/mirri/entities/__init__.py b/mirri/entities/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mirri/entities/_private_classes.py b/mirri/entities/_private_classes.py new file mode 100644 index 0000000..4cbf5c1 --- /dev/null +++ b/mirri/entities/_private_classes.py @@ -0,0 +1,45 @@ +class FrozenClass(object): + __isfrozen = False + + def __setattr__(self, key, value): + # print(dir(self)) + if self.__isfrozen and not hasattr(self, key): + msg = f"Can not add {key} to {self.__class__.__name__}. 
class FrozenClass(object):
    """Base class whose instances stop accepting new attributes once frozen.

    Until ``_freeze()`` is called any attribute may be created.  Afterwards
    only names that already resolve on the instance (instance attributes,
    class attributes, properties) may be assigned.
    """

    # Name-mangled to ``_FrozenClass__isfrozen``; subclasses cannot clash.
    __isfrozen = False

    def __setattr__(self, key, value):
        """Assign ``key``; raise ``TypeError`` for unknown names once frozen."""
        if self.__isfrozen and not hasattr(self, key):
            msg = f"Can not add {key} to {self.__class__.__name__}. It is not one of its attributes"
            raise TypeError(msg)
        object.__setattr__(self, key, value)

    def _freeze(self):
        """Forbid the creation of new attributes from now on."""
        self.__isfrozen = True


class _FieldBasedClass(FrozenClass):
    """Frozen container driven by a ``_fields`` declaration.

    ``_fields`` is a list of ``{"attribute": ..., "label": ...}`` dicts:
    ``attribute`` is the Python attribute name and ``label`` the key used in
    the plain-dict representation accepted by ``__init__`` and produced by
    ``dict()``.
    """

    _fields = []

    def __init__(self, data=None, freeze=True):
        """Populate every declared attribute from ``data`` (label -> value).

        Labels missing from ``data`` are assigned ``None``.  When ``freeze``
        is true the instance is frozen afterwards.
        """
        self._data = {}
        if data is None:
            data = {}
        for field in self._fields:
            setattr(self, field["attribute"], data.get(field["label"], None))
        if freeze:
            self._freeze()

    def __eq__(self, o: object) -> bool:
        """Compare field by field against another instance of the same class.

        Objects of an unrelated type are never equal.  Without this guard the
        ``getattr(o, ..., None)`` fallback made an empty instance compare
        equal to ``None`` or to any object lacking the attributes.
        """
        if not isinstance(o, type(self)):
            return NotImplemented
        return all(
            getattr(self, field["attribute"], None)
            == getattr(o, field["attribute"], None)
            for field in self._fields
        )

    def __bool__(self):
        """Return True when at least one field holds a non-None value."""
        return bool(self.dict())

    def dict(self):
        """Return ``{label: value}`` for every field whose value is not None."""
        data = {}
        for field in self._fields:
            value = getattr(self, field["attribute"])
            if value is not None:
                data[field["label"]] = value
        return data
from calendar import monthrange
from collections import OrderedDict
from datetime import date


class DateRange:
    """A date known with year, year-month or year-month-day precision.

    The instance keeps the components it was given plus the first and last
    calendar day they cover (``range``).  Unknown components are rendered as
    dashes by ``strfdate`` (e.g. ``"2020----"``, ``"202005--"``).
    """

    def __init__(self, year=None, month=None, day=None):
        """Store the components and, if any is given, compute the range.

        Raises:
            ValueError: month outside 1-12, day outside 1-31, a month/day
                given without a year, or an impossible calendar date.
        """
        self._year = year
        if month is not None and (month < 1 or month > 12):
            raise ValueError("Month must be between 1 and 12")
        self._month = month
        if day is not None and (day < 1 or day > 31):
            raise ValueError("Day must be between 1 and 31")
        self._day = day

        self._start = None
        self._end = None
        if year or month or day:
            self._create_range()

    def __str__(self):
        """Return ``strfdate`` or an empty string for an empty range."""
        _strdate = self.strfdate
        return "" if _strdate is None else _strdate

    def __bool__(self):
        return bool(self._year or self._month or self._day)

    def _create_range(self):
        """Compute ``_start``/``_end`` from the stored components."""
        year, month, day = self._year, self._month, self._day
        # Every branch below needs a year; fail with a clear ValueError
        # instead of a TypeError from date() or an unbound local.
        if year is None:
            raise ValueError("Year is required to build a date range")
        if month is None:
            start_date = date(year=year, month=1, day=1)
            end_date = date(year=year, month=12, day=31)
        elif day is None:
            month_last_day = monthrange(year, month)[1]
            start_date = date(year=year, month=month, day=1)
            end_date = date(year=year, month=month, day=month_last_day)
        else:
            start_date = date(year=year, month=month, day=day)
            end_date = date(year=year, month=month, day=day)

        self._start = start_date
        self._end = end_date

    def strpdate(self, date_str: str):
        """Parse ``YYYY[MM[DD]]`` (``/`` and ``-`` are ignored) into self.

        Because dashes are stripped, the output of ``strfdate`` can be parsed
        back.  Returns ``self`` so calls can be chained.

        Raises:
            ValueError: more than 8 significant characters, fewer than the
                4 needed for a year, non-numeric parts, or out-of-range
                month/day.
        """
        orig_date = str(date_str)
        date_str = orig_date.replace("/", "").replace("-", "")
        if len(date_str) > 8:
            msg = f"Malformed date, Mora caracters than expected: {orig_date}"
            raise ValueError(msg)
        if len(date_str) < 4:
            # Previously this fell through with ``year`` never assigned.
            msg = f"Malformed date, a four digit year is required: {orig_date}"
            raise ValueError(msg)
        year = int(date_str[:4])
        month = None
        day = None
        if len(date_str) >= 6:
            month = int(date_str[4:6])
            if month < 1 or month > 12:
                raise ValueError("Month must be between 1 and 12")
        if len(date_str) >= 8:
            day = int(date_str[6:8])
            if day < 1 or day > 31:
                raise ValueError("Day must be between 1 and 31")
        self._year = year
        self._month = month
        self._day = day
        self._create_range()
        return self

    @property
    def strfdate(self):
        """``YYYYMMDD`` with dashes for unknown parts; None when all unknown."""
        year = "----" if self._year is None else f"{self._start.year:04}"
        month = "--" if self._month is None else f"{self._start.month:02}"
        day = "--" if self._day is None else f"{self._start.day:02}"
        _date = f"{year}{month}{day}"
        if _date == "--------":
            return None
        return _date

    @property
    def range(self):
        """Ordered mapping with the ``start`` and ``end`` dates (or None)."""
        return OrderedDict([("start", self._start), ("end", self._end)])
class GrowthMedium:
    """Growth medium record restricted to a fixed set of attributes.

    Values live in ``self._data``; only the names listed in ``fields`` can be
    read or written as attributes.  Unset fields read as ``None``.
    """

    fields = ['record_id', 'record_name', 'acronym', 'full_description',
              'ingredients', 'description', 'other_name', 'ph',
              'sterilization_conditions']

    def __init__(self, **kwargs):
        """Set every allowed field passed as a non-None keyword argument."""
        self._data = {}
        for field in self.fields:
            # The original indexed kwargs with the literal 'field', which
            # raised KeyError as soon as any allowed field was passed.
            value = kwargs.get(field)
            if value is not None:
                setattr(self, field, value)

    def __setattr__(self, attr, value):
        """Store allowed fields in ``_data``; reject anything else."""
        if attr == '_data':
            super().__setattr__(attr, value)
            return
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        self._data[attr] = value

    def __getattr__(self, attr):
        """Return the stored value of an allowed field (None when unset).

        Only called when normal lookup fails.  A missing ``_data`` and
        dunder probes (``__deepcopy__``, ``__setstate__`` ...) raise
        ``AttributeError`` so that ``hasattr``, ``copy`` and ``pickle`` keep
        working; other unknown names raise ``TypeError`` as before.
        """
        if attr == '_data' or (attr.startswith('__') and attr.endswith('__')):
            raise AttributeError(attr)
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        return self._data.get(attr, None)

    def dict(self):
        """Return the underlying ``{field: value}`` mapping (not a copy)."""
        return self._data

    def update(self, growth_media, include_fields=None):
        """Copy differing, non-None values from ``growth_media`` into self.

        ``include_fields`` limits the copy to the given field names; ``None``
        (the default) means every field.  Previously the default silently
        copied nothing.
        """
        for field in self.fields:
            if include_fields is not None and field not in include_fields:
                continue
            new_value = getattr(growth_media, field, None)
            actual_value = getattr(self, field, None)
            if new_value is not None and new_value != actual_value:
                setattr(self, field, new_value)

    def is_equal(self, other, exclude_fields=()):
        """Return True if every non-None field of self matches ``other``.

        Fields named in ``exclude_fields`` are skipped.  Fields that are
        None on self are not compared.
        """
        for field in self.fields:
            if field in exclude_fields:
                continue
            value_of_other = getattr(other, field, None)
            value_of_self = getattr(self, field, None)
            if value_of_self is not None and value_of_self != value_of_other:
                return False
        return True
"label": SITE}, + {"attribute": "other", "label": OTHER}, + {"attribute": "island", "label": ISLAND}, + {"attribute": "longitude", "label": LONGITUDE}, + {"attribute": "latitude", "label": LATITUDE}, + {"attribute": "altitude", "label": ALTITUDE}, + {"attribute": "coord_spatial_reference", "label": COORD_SPATIAL_REFERENCE}, + {"attribute": "coord_uncertainty", "label": COORDUNCERTAINTY}, + {"attribute": "georef_method", "label": GEOREF_METHOD}, + ] + + def __str__(self): + _site = [] + if self.country: + _site.append(self.country) + if self.province: + _site.append(self.province) + if self.site: + _site.append(self.site) + if self.municipality: + _site.append(self.municipality) + + return ": ".join(_site) + + def __hash__(self): + hash_str = '' + for field in self._fields: + value = str(getattr(self, field, None)) + hash_str += value + # hash_str = str(self.country) + str(self.province) + str(self.municipality) + str(self.site) + return int(hashlib.sha1(hash_str.encode("utf-8")).hexdigest(), 16) % (10 ** 8) + + @property + def country(self) -> Union[str, None]: + return self._data.get(COUNTRY, None) + + @country.setter + def country(self, code3: str): + if code3 is not None: + _country = pycountry.countries.get(alpha_3=code3) + if _country is None: + _country = pycountry.historic_countries.get(alpha_3=code3) + if _country is None and code3 != 'INW': + raise ValueError(f'{code3}, not a valid 3 letter country name') + self._data[COUNTRY] = code3 + + @property + def province(self) -> Union[str, None]: + return self._data.get(PROVINCE, None) + + @province.setter + def province(self, code3: str): + self._data[PROVINCE] = code3 + + @property + def municipality(self) -> Union[str, None]: + return self._data.get(MUNICIPALITY, None) + + @municipality.setter + def municipality(self, name: str): + self._data[MUNICIPALITY] = name + + @property + def site(self) -> Union[str, None]: + return self._data.get(SITE, None) + + @site.setter + def site(self, name: str): + 
self._data[SITE] = name + + @property + def latitude(self): + return self._data.get(LATITUDE, None) + + @latitude.setter + def latitude(self, latitude: float): + self._data[LATITUDE] = latitude + + @property + def longitude(self) -> Union[float, None]: + return self._data.get(LONGITUDE, None) + + @longitude.setter + def longitude(self, longitude: float): + self._data[LONGITUDE] = longitude + + @property + def altitude(self) -> Union[int, float, None]: + return self._data.get(ALTITUDE, None) + + @altitude.setter + def altitude(self, altitude: Union[int, float]): + self._data[ALTITUDE] = altitude + + @property + def georef_method(self) -> Union[str, None]: + return self._data.get(GEOREF_METHOD, None) + + @georef_method.setter + def georef_method(self, georef_method: str): + self._data[GEOREF_METHOD] = georef_method + + @property + def coord_uncertainty(self) -> Union[str, None]: + return self._data.get(COORDUNCERTAINTY, None) + + @coord_uncertainty.setter + def coord_uncertainty(self, coord_uncertainty: str): + self._data[COORDUNCERTAINTY] = coord_uncertainty + + @property + def coord_spatial_reference(self) -> Union[str, None]: + return self._data.get(COORD_SPATIAL_REFERENCE, None) + + @coord_spatial_reference.setter + def coord_spatial_reference(self, coord_spatial_reference: str): + self._data[COORD_SPATIAL_REFERENCE] = coord_spatial_reference + + @property + def state(self) -> Union[str, None]: + return self._data.get(STATE, None) + + @state.setter + def state(self, state): + self._data[STATE] = state + + @property + def island(self) -> Union[str, None]: + return self._data.get(ISLAND, None) + + @island.setter + def island(self, island): + self._data[ISLAND] = island + + @property + def other(self) -> Union[str, None]: + return self._data.get(OTHER, None) + + @other.setter + def other(self, other): + self._data[OTHER] = other diff --git a/mirri/entities/publication.py b/mirri/entities/publication.py new file mode 100644 index 0000000..c328869 --- /dev/null +++ 
from mirri.settings import (BOOK_EDITOR, BOOK_PUBLISHER, BOOK_TITLE,
                            PUB_AUTHORS, PUB_DOI, PUB_FIRST_PAGE, PUB_ID,
                            PUB_ISSUE, PUB_JOURNAL, PUB_LAST_PAGE,
                            PUB_PUBMED_ID, PUB_TITLE, PUB_VOLUME)

# Maybe we could implement some crossref calls to fill all field data
# and get DOI where ever is possible

RECORD_ID = 'RecordId'
RECORD_NAME = 'RecordName'


def _optional_field(label):
    """Build a property stored under ``label``; assigning None is ignored."""

    def getter(self):
        return self._data.get(label, None)

    def setter(self, value):
        if value is not None:
            self._data[label] = value

    return property(getter, setter)


class Publication:
    """Bibliographic reference stored as a label -> value dictionary.

    Every attribute is a property over ``self._data``.  Assigning ``None``
    never stores anything, so ``dict()`` only holds the values provided.
    """

    def __init__(self, data=None):
        """Load the known labels from ``data`` (a mapping) when given."""
        self._data = {}
        if data:
            self.record_id = data.get('RecordId', None)
            self.record_name = data.get('RecordName', None)
            self.pubmed_id = data.get(PUB_PUBMED_ID, None)
            self.doi = data.get(PUB_DOI, None)
            self.title = data.get(PUB_TITLE, None)
            self.authors = data.get(PUB_AUTHORS, None)
            self.journal = data.get(PUB_JOURNAL, None)
            self.volume = data.get(PUB_VOLUME, None)
            self.issue = data.get(PUB_ISSUE, None)
            self.first_page = data.get(PUB_FIRST_PAGE, None)
            self.last_page = data.get(PUB_LAST_PAGE, None)
            # The property is ``editors``; the original assigned ``editor``,
            # a plain instance attribute, so the value never reached _data.
            self.editors = data.get(BOOK_EDITOR, None)
            self.publisher = data.get(BOOK_PUBLISHER, None)
            self.book_title = data.get(BOOK_TITLE, None)
            self.isbn = data.get('ISBN', None)
            self.issn = data.get('ISSN', None)
            self.year = data.get('Year', None)

    def __bool__(self):
        return bool(self._data)

    def dict(self):
        """Return the underlying mapping (not a copy)."""
        return self._data

    id = _optional_field(PUB_ID)
    record_id = _optional_field(RECORD_ID)
    record_name = _optional_field(RECORD_NAME)
    pubmed_id = _optional_field(PUB_PUBMED_ID)
    isbn = _optional_field('ISBN')
    issn = _optional_field('ISSN')
    doi = _optional_field(PUB_DOI)

    @property
    def title(self):
        return self._data.get(PUB_TITLE, None)

    @title.setter
    def title(self, value: str):
        """Store the title and mirror it into the record name."""
        if value is not None:
            self._data[PUB_TITLE] = value
            self._data[RECORD_NAME] = value

    authors = _optional_field(PUB_AUTHORS)
    journal = _optional_field(PUB_JOURNAL)
    volume = _optional_field(PUB_VOLUME)
    issue = _optional_field(PUB_ISSUE)
    first_page = _optional_field(PUB_FIRST_PAGE)
    last_page = _optional_field(PUB_LAST_PAGE)
    book_title = _optional_field(BOOK_TITLE)
    editors = _optional_field(BOOK_EDITOR)
    # Alias kept for code that used the attribute name set by __init__.
    editor = editors
    publisher = _optional_field(BOOK_PUBLISHER)
    year = _optional_field('Year')
from mirri.entities._private_classes import _FieldBasedClass
from mirri.settings import (
    ALLOWED_MARKER_TYPES,
    MARKER_INSDC,
    MARKER_SEQ,
    MARKER_TYPE)

from mirri import ValidationError


class GenomicSequence(_FieldBasedClass):
    """Marker sequence of a strain: marker type, INSDC id and sequence."""

    _fields = [
        {"attribute": "marker_type", "label": MARKER_TYPE},
        {"attribute": "marker_id", "label": MARKER_INSDC},
        {"attribute": "marker_seq", "label": MARKER_SEQ},
    ]

    @property
    def marker_type(self):
        return self._data.get(MARKER_TYPE, None)

    @marker_type.setter
    def marker_type(self, value: str):
        """Store the marker type after checking it is an allowed acronym.

        ``None`` is stored without validation.

        Raises:
            ValidationError: ``value`` is not one of the acronyms declared in
                ``ALLOWED_MARKER_TYPES``.
        """
        if value is not None:
            acronyms = [m["acronym"] for m in ALLOWED_MARKER_TYPES]
            # Compare against the list: testing membership in the joined
            # string accepted any substring of it (fragments, a space...).
            if value not in acronyms:
                types = " ".join(acronyms)
                msg = f"{value} not in allowed marker types: {types}"
                raise ValidationError(msg)
        self._data[MARKER_TYPE] = value

    @property
    def marker_id(self) -> str:
        return self._data.get(MARKER_INSDC, None)

    @marker_id.setter
    def marker_id(self, value: str):
        self._data[MARKER_INSDC] = value

    @property
    def marker_seq(self) -> str:
        return self._data.get(MARKER_SEQ, None)

    @marker_seq.setter
    def marker_seq(self, value: str):
        self._data[MARKER_SEQ] = value
@marker_seq.setter + def marker_seq(self, value: str): + self._data[MARKER_SEQ] = value diff --git a/mirri/entities/strain.py b/mirri/entities/strain.py new file mode 100644 index 0000000..a9a51e5 --- /dev/null +++ b/mirri/entities/strain.py @@ -0,0 +1,1243 @@ +""" +Created on 2020(e)ko abe. 1(a) + +@author: peio +""" +from __future__ import annotations + +import re +from collections import OrderedDict +from copy import deepcopy +from typing import List, Union + +import pycountry + +from mirri import ValidationError +from mirri.entities._private_classes import _FieldBasedClass, FrozenClass +from mirri.entities.date_range import DateRange +from mirri.entities.location import Location +from mirri.entities.publication import Publication +from mirri.entities.sequence import GenomicSequence +from mirri.settings import ( + ABS_RELATED_FILES, + ACCESSION_NAME, + ACCESSION_NUMBER, + ALLOWED_FORMS_OF_SUPPLY, + ALLOWED_MARKER_TYPES, + ALLOWED_NAGOYA_OPTIONS, + ALLOWED_PLOIDIES, + ALLOWED_RESTRICTION_USE_OPTIONS, + ALLOWED_RISK_GROUPS, + ALLOWED_SUBTAXA, + ALLOWED_TAXONOMIC_RANKS, + APPLICATIONS, + COLLECT, + COLLECTED_BY, + COLLECTION_CODE, + COMMENTS_ON_TAXONOMY, + DATE_OF_COLLECTION, + DATE_OF_INCLUSION, + DATE_OF_ISOLATION, + DEPOSIT, + DEPOSITOR, + DUAL_USE, + ENZYME_PRODUCTION, + FORM_OF_SUPPLY, + GENETICS, + GENOTYPE, + GENUS, + GMO, + GMO_CONSTRUCTION_INFO, + GROWTH, + HISTORY_OF_DEPOSIT, + INFRASUBSPECIFIC_NAME, + INTERSPECIFIC_HYBRID, + ISOLATED_BY, + ISOLATION, + ISOLATION_HABITAT, LITERATURE_LINKED_TO_SEQ_GENOME, + LOCATION, + MARKER_INSDC, + MARKER_SEQ, + MARKER_TYPE, + MARKERS, + MTA_FILES, + MUTANT_INFORMATION, + NAGOYA_PROTOCOL, + ONTOBIOTOPE_ISOLATION_HABITAT, + ORGANISM_TYPE, + OTHER_CULTURE_NUMBERS, + PATHOGENICITY, PLANT_PATHOGENICITY_CODE, + PLASMIDS, + PLASMIDS_COLLECTION_FIELDS, + PLOIDY, + PRODUCTION_OF_METABOLITES, + PUBLICATIONS, + QUARANTINE, + RECOMMENDED_GROWTH_MEDIUM, + RECOMMENDED_GROWTH_TEMP, + REMARKS, + RESTRICTION_ON_USE, + RISK_GROUP, + 
SEXUAL_STATE, + SPECIES, + STATUS, + STRAIN_FROM_REGISTERED_COLLECTION, + STRAIN_ID, + STRAIN_PUI, + STRAIN_URL, + SUBSTRATE_HOST_OF_ISOLATION, + ID_SYNONYMS, + TAXONOMY, + TESTED_TEMPERATURE_GROWTH_RANGE, SUBTAXAS, DATE_OF_DEPOSIT, HYBRIDS, +) + +RANK_TRANSLATOR = { + "subspecies": "subsp.", + "convarietas": "convar.", + "variety": "var.", + "group": "Group", + "forma": "f.", + "forma.specialis": 'f.sp.' +} + +# ORG_TYPES = { +# "algae": 1, +# "archaea": 2, +# "bacteria": 3, +# "fungi": 4, +# "virus": 5, +# "yeast": 6, +# } + +ORG_TYPES = { + "Algae": 1, + "Archaea": 2, + "Bacteria": 3, + "Cyanobacteria": 4, + "Filamentous Fungi": 5, + "Phage": 6, + "Plasmid": 7, + "Virus": 8, + "Yeast": 9, +} + + +class OrganismType(FrozenClass): + + def __init__(self, value=None): + self._data = {} + self.guess_type(value) + self._freeze() + + def dict(self): + return self._data + + def __str__(self): + return f"{self.code} {self.name}" + + @property + def code(self): + return self._data.get("code", None) + + @code.setter + def code(self, code: int): + try: + code = int(code) + except TypeError as error: + msg = f"code {code} not accepted for organism type" + raise ValidationError(msg) from error + + if code not in ORG_TYPES.values(): + msg = f"code {code} not accepted for organism type" + raise ValidationError(msg) + self._data["code"] = code + name = None + for _name, _code in ORG_TYPES.items(): + if _code == code: + name = _name + self._data["name"] = name + + @property + def name(self): + return self._data.get("name", None) + + @name.setter + def name(self, name: str): + error_msg = f"name {name} not accepted for organism type" + accepted_types = ORG_TYPES.keys() + if name not in accepted_types: + raise ValidationError(error_msg) + self._data["name"] = name # TODO: are we case sensitive? 
+ self._data["code"] = ORG_TYPES[name] + + def guess_type(self, value): + if value is None or not value: + raise ValueError(" Can not set an empty value") + try: + value = int(value) + self.code = value + except ValueError: + self.name = value + + +class Taxonomy(FrozenClass): + def __init__(self, data=None): + self._data = {} + if data is not None: + if ORGANISM_TYPE in data: + self.organism_type = [OrganismType(ot) + for ot in data[ORGANISM_TYPE]] + if GENUS in data: + self.genus = data[GENUS] + if SPECIES in data: + self.species = data[SPECIES] + if INFRASUBSPECIFIC_NAME in data: + self.infrasubspecific_name = data[INFRASUBSPECIFIC_NAME] + if COMMENTS_ON_TAXONOMY in data: + self.comments = data[COMMENTS_ON_TAXONOMY] + if INTERSPECIFIC_HYBRID in data: + self.interspecific_hybrid = data[INTERSPECIFIC_HYBRID] + if HYBRIDS in data: + self.hybrids = data[HYBRIDS] + + self._freeze() + + def __bool__(self): + return bool(self._data) + + def dict(self): + data = {} + for key, value in self._data.items(): + if value is None: + continue + if key == ORGANISM_TYPE: + value = [val.dict() for val in value] + data[key] = value + return data + + def __getitem__(self, key): + return self._data[key] + + @property + def organism_type(self): + return self._data.get(ORGANISM_TYPE, None) + + @organism_type.setter + def organism_type(self, organism_type: List[OrganismType]): + if isinstance(organism_type, list) and all( + isinstance(x, OrganismType) for x in organism_type + ): + self._data[ORGANISM_TYPE] = organism_type + else: + msg = "organism_type must be a list of OrganismType instances" + raise ValidationError(msg) + + @property + def infrasubspecific_name(self): + return self._data.get(INFRASUBSPECIFIC_NAME, None) + + @infrasubspecific_name.setter + def infrasubspecific_name(self, name): + self._data[INFRASUBSPECIFIC_NAME] = name + + @property + def comments(self): + return self._data.get(COMMENTS_ON_TAXONOMY, None) + + @comments.setter + def comments(self, comments): + 
self._data[COMMENTS_ON_TAXONOMY] = comments + + @property + def interspecific_hybrid(self): + return self._data.get(INTERSPECIFIC_HYBRID, None) + + @interspecific_hybrid.setter + def interspecific_hybrid(self, interspecific_hybrid): + self._data[INTERSPECIFIC_HYBRID] = interspecific_hybrid + + @property + def genus(self): + return self._data.get(GENUS, {}).get("name", None) + + @genus.setter + def genus(self, genus): + if GENUS not in self._data: + self._data[GENUS] = {} + self._data[GENUS]["name"] = genus + + @property + def species(self): + return self._data.get(SPECIES, {}).get("name", None) + + @species.setter + def species(self, species): + self._data[SPECIES] = {"name": species} + + @property + def species_author(self): + return self._data.get(SPECIES, {}).get("author", None) + + @species_author.setter + def species_author(self, species_author): + if not self.species: + msg = "Can not set species author if species is not set" + raise ValidationError(msg) + self._data[SPECIES]["author"] = species_author + + @property + def hybrids(self) -> list[str]: + return self._data.get(HYBRIDS, None) + + @hybrids.setter + def hybrids(self, hybrids: List[str]): + if isinstance(hybrids, (tuple, list)): + self._data[HYBRIDS] = hybrids + @property + def subtaxas(self): + return { + key: value for key, value in self._data.items() if key in ALLOWED_SUBTAXA + } + + def get_subtaxa_name(self, rank): + return self._data.get(rank, {}).get("name", None) + + def get_subtaxa_author(self, rank): + return self._data.get(rank, {}).get("author", None) + + def set_subtaxa_name(self, rank, name): + if rank in ALLOWED_SUBTAXA: + self._data[rank] = {"name": name} + + def set_subtaxa_author(self, rank, author): + if rank in ALLOWED_SUBTAXA and self.get_subtaxa_name(rank): + self._data[rank]["author"] = author + + def add_subtaxa(self, subtaxa_rank, subtaxa_name, subtaxa_author=None): + if subtaxa_rank not in ALLOWED_SUBTAXA: + raise ValidationError("{} Rank not allowed".format(subtaxa_rank)) + 
class _GeneralStep(FrozenClass):
    """Common base for the steps in a strain's history.

    Subclasses declare which of the three pieces of information they carry by
    setting ``_date_tag``, ``_who_tag`` and ``_location_tag`` to the label
    used in the dictionary representation; ``None`` means "not supported".
    """

    _date_tag = None
    _who_tag = None
    _location_tag = None

    def __init__(self, data=None):
        """Load location, who and date from ``data`` for the supported tags."""
        self._data = {}
        if data is None:
            data = {}
        if self._location_tag is not None:
            self.location = Location(data.get(self._location_tag, None))
        # Guard on the tag that is actually used (was ``_date_tag``).
        if self._who_tag:
            self.who = data.get(self._who_tag, None)
        if self._date_tag:
            _date = DateRange()
            if data and self._date_tag in data:
                _date = _date.strpdate(data[self._date_tag])
            self.date = _date

    def __bool__(self):
        return bool(self.location) or bool(self.date) or bool(self.who)

    @property
    def location(self) -> Location:
        return self._data.get(self._location_tag, None)

    @location.setter
    def location(self, location: Location):
        """Store the location.

        Raises:
            ValidationError: the class has no location tag, or ``location``
                is not a Location instance.
        """
        if self._location_tag is None:
            # The error used to be *returned*, which silently did nothing.
            raise ValidationError("Can't set location on this class")
        if not isinstance(location, Location):
            raise ValidationError("Location must be a Location instance")
        self._data[self._location_tag] = location

    @property
    def who(self) -> str:
        return self._data.get(self._who_tag, None)

    @who.setter
    def who(self, by_who: str):
        """Store who performed the step.

        Raises:
            ValidationError: the class has no who tag.
        """
        if self._who_tag is None:
            raise ValidationError("Can't set who on this class")
        self._data[self._who_tag] = by_who

    @property
    def date(self) -> DateRange:
        return self._data.get(self._date_tag, None)

    @date.setter
    def date(self, _date: DateRange):
        """Store the date; ``None`` is ignored.

        Raises:
            ValidationError: the class has no date tag, or ``_date`` is not
                a DateRange instance.
        """
        if self._date_tag is None:
            raise ValidationError("Can't set date on this class")
        if _date is not None:
            if not isinstance(_date, DateRange):
                raise ValidationError("Date must be a DateRange instance")
            self._data[self._date_tag] = _date

    def dict(self):
        """Return the truthy pieces: location as dict, date as ``strfdate``."""
        _data = {}
        if self.location:
            _data[self._location_tag] = self.location.dict()
        if self.who:
            _data[self._who_tag] = self._data[self._who_tag]
        if self.date:
            _data[self._date_tag] = self._data[self._date_tag].strfdate
        return _data
class StrainId(FrozenClass):
    """Identifier of a strain: collection code, accession number, PUI and URL.

    The values live in a plain dict (``_id_dict``) keyed by the module-level
    field constants; the properties below are thin accessors over it.
    """

    def __init__(self, id_dict=None, collection=None, number=None):
        """Build the id from a ready-made dict or from collection/number.

        Raises:
            ValidationError: if a non-empty ``id_dict`` is combined with
                ``collection`` or ``number``.
        """
        if id_dict and (collection or number):
            msg = "Can not initialize with dict and number or collection"
            raise ValidationError(msg)
        if id_dict is None:
            id_dict = {}
        self._id_dict = id_dict
        if collection:
            self.collection = collection
        if number:
            self.number = number
        self._freeze()

    def __bool__(self):
        return bool(self._id_dict)

    def __eq__(self, other):
        # Let Python fall back to its default handling for foreign types
        # instead of failing on a missing attribute.
        if not isinstance(other, StrainId):
            return NotImplemented
        return self.collection == other.collection and self.number == other.number

    def __ne__(self, other):
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __str__(self):
        # __str__ must always return a string, so an empty id renders as "".
        return self.strain_id or ""

    def dict(self):
        """Return the underlying dict (not a copy)."""
        return self._id_dict

    @property
    def strain_id(self):
        """Return "<collection> <number>", or None when neither part is set."""
        parts = [str(part) for part in (self.collection, self.number)
                 if part is not None]
        return " ".join(parts) if parts else None

    @property
    def collection(self):
        return self._id_dict.get(COLLECTION_CODE, None)

    @collection.setter
    def collection(self, collection):
        assert collection and isinstance(collection, str)
        self._id_dict[COLLECTION_CODE] = collection

    @property
    def number(self):
        return self._id_dict.get(ACCESSION_NUMBER, None)

    @number.setter
    def number(self, germplasm_number):
        assert germplasm_number and isinstance(germplasm_number, str)
        self._id_dict[ACCESSION_NUMBER] = germplasm_number

    @property
    def pui(self):
        return self._id_dict.get(STRAIN_PUI, None)

    @pui.setter
    def pui(self, pui):
        assert pui and isinstance(pui, str)
        self._id_dict[STRAIN_PUI] = pui

    @property
    def url(self):
        return self._id_dict.get(STRAIN_URL, None)

    @url.setter
    def url(self, url):
        assert url and isinstance(url, str)
        self._id_dict[STRAIN_URL] = url

    def keys(self):
        return self._id_dict.keys()

    def copy(self):
        """Return a new StrainId backed by its own copy of the data."""
        # A dict literal is used because ``dict`` is shadowed by the method
        # of the same name inside this class body.
        return StrainId({**self._id_dict})
def dict(self):
    """Serialise the stored genetics data into plain Python values.

    Entries whose value is ``None`` or an empty list are left out.  List
    values are converted item by item: strings are kept as they are, any
    other item is replaced by the result of its own ``dict()`` method.
    """
    serialized = {}
    for field, content in self._data.items():
        if content is None or content == []:
            continue
        if isinstance(content, list):
            content = [item if isinstance(item, str) else item.dict()
                       for item in content]
        serialized[field] = content
    return serialized
@property
def tested_temp_range(self) -> dict:
    """Return the stored tested temperature growth range, or None if unset."""
    return self._data.get(TESTED_TEMPERATURE_GROWTH_RANGE, None)

@tested_temp_range.setter
def tested_temp_range(self, val: dict):
    """Store the tested temperature growth range; ``None`` is ignored.

    Raises:
        ValidationError: if ``val`` is not a dict holding both "min" and
            "max" keys.
    """
    if val is not None:
        # Require a real dict, as recommended_temp does: a string or list
        # that merely contains "min"/"max" must not be accepted.
        if isinstance(val, dict) and "min" in val and "max" in val:
            self._data[TESTED_TEMPERATURE_GROWTH_RANGE] = val
        else:
            msg = "A dict with min and max is required"
            raise ValidationError(msg)
def dict(self):
    """Serialise the strain into a dict of plain values.

    Nested entities are converted through their own ``dict()`` method, the
    list fields item by item, and the catalog inclusion date through its
    ``strfdate`` attribute.  Fields that are ``None``, or that serialise to
    an empty dict or list, are left out.
    """
    nested_fields = (STRAIN_ID, COLLECT, DEPOSIT, ISOLATION, GROWTH,
                     GENETICS, TAXONOMY)
    list_fields = (OTHER_CULTURE_NUMBERS, PUBLICATIONS, ID_SYNONYMS)

    data = {}
    for field, value in self._data.items():
        # An unset field has nothing to serialise; skipping it here also
        # keeps a nested entity or list set to None from crashing below.
        if value is None:
            continue
        if field in nested_fields:
            value = value.dict()
            if value == {}:
                value = None
        elif field in list_fields:
            value = [item.dict() for item in value]
            if value == []:
                value = None
        elif field == DATE_OF_INCLUSION:
            value = value.strfdate
        if value is not None:
            data[field] = value

    return data
@property
def is_subject_to_quarantine(self) -> bool:
    """Return the stored quarantine flag, or None when it was never set."""
    # Use .get like the sibling getters so an unset flag reads as None
    # instead of raising KeyError.
    return self._data.get(QUARANTINE, None)

@is_subject_to_quarantine.setter
def is_subject_to_quarantine(self, quarantine: bool):
    """Store the quarantine flag.

    Raises:
        ValidationError: if the value is neither None nor a bool.
    """
    if quarantine is not None and not isinstance(quarantine, bool):
        msg = "Is subject to quarantine must be boolean"
        raise ValidationError(msg)
    self._data[QUARANTINE] = quarantine
@property
def abs_related_files(self) -> List[str]:
    """Return the stored ABS related files, or None if unset."""
    return self._data.get(ABS_RELATED_FILES, None)

@abs_related_files.setter
def abs_related_files(self, value: List[str]):
    """Store the ABS related files; ``None`` leaves the stored value untouched.

    Raises:
        ValidationError: if the value is not None and not a list.
    """
    if value is None:
        return
    if not isinstance(value, list):
        raise ValidationError("Value must be a list")
    self._data[ABS_RELATED_FILES] = value
@property
def form_of_supply(self) -> List[str]:
    """Return the stored forms of supply, or None if unset."""
    return self._data.get(FORM_OF_SUPPLY, None)

@form_of_supply.setter
def form_of_supply(self, value: List[str]):
    """Store the forms of supply after a case-insensitive validation.

    ``None`` is stored as-is without validation.

    Raises:
        ValidationError: if ``value`` is a bare string or contains an entry
            that is not one of ``ALLOWED_FORMS_OF_SUPPLY``.
    """
    if value is not None:
        # A bare string would otherwise be validated character by
        # character, producing a misleading result.
        if isinstance(value, str):
            msg = f"Not allowed forms of supply {value}: "
            msg += f"{', '.join(ALLOWED_FORMS_OF_SUPPLY)}"
            raise ValidationError(msg)
        allowed = {f.lower() for f in ALLOWED_FORMS_OF_SUPPLY}
        if {v.lower() for v in value}.difference(allowed):
            msg = f"Not allowed forms of supply {value}: "
            msg += f"{', '.join(ALLOWED_FORMS_OF_SUPPLY)}"
            raise ValidationError(msg)
    self._data[FORM_OF_SUPPLY] = value
@property
def enzyme_production(self) -> str:
    """Return the stored enzyme production text, or None if unset."""
    return self._data.get(ENZYME_PRODUCTION, None)

@enzyme_production.setter
def enzyme_production(self, value: str):
    """Store the enzyme production text; falsy values are ignored."""
    if not value:
        return
    self._data[ENZYME_PRODUCTION] = value
def add_taxon_to_strain(strain, value):
    """Parse a taxon name string and store its parts in ``strain.taxonomy``.

    Three shapes are handled:

    * a hybrid written with ``*`` or ``×`` ("Genus a × b"): the second
      parent inherits the first word of the first parent;
    * two names separated by ``;``: stored as a hybrid pair as-is;
    * a single name: "Genus [species [rank name]...]", where each rank
      token is looked up in ``SUBTAXAS``.  A species of "sp", "spp", ".sp"
      or "sp." stops the parsing after the genus.

    Empty or blank input is ignored.

    Raises:
        ValidationError: if a rank token is unknown, a rank token has no
            name after it, or a hybrid has nothing before the cross sign.
    """
    def _bad_taxon_error():
        # Built lazily: the message reads strain.id, which is only needed
        # when the name really is malformed.
        return ValidationError(
            f'The "Taxon Name" for strain with accession number {strain.id.collection} {strain.id.number} is not according to specification.'
        )

    value = value.strip()
    if not value:
        return
    if "*" in value or "×" in value:
        # Strip each parent so "a × b" does not leave stray blanks behind.
        parents = [part.strip() for part in re.split(r"\*|×", value)]
        first_parent_words = parents[0].split()
        if not first_parent_words:
            raise _bad_taxon_error()
        spps = [parents[0], f"{first_parent_words[0]} {parents[1]}"]
    else:
        spps = [v.strip() for v in value.split(";")]

    if len(spps) == 2:
        strain.taxonomy.hybrids = spps
        strain.taxonomy.interspecific_hybrid = True
        return

    items = re.split(r" +", spps[0])
    strain.taxonomy.genus = items[0]
    if len(items) > 1:
        species = items[1]
        if species in ("sp", "spp", ".sp", "sp."):
            return
        strain.taxonomy.species = species

    # The remaining tokens come in (rank, name) pairs.
    subtaxa = items[2:]
    for index in range(0, len(subtaxa), 2):
        rank = SUBTAXAS.get(subtaxa[index], None)
        if rank is None or index + 1 >= len(subtaxa):
            raise _bad_taxon_error()
        strain.taxonomy.add_subtaxa(rank, subtaxa[index + 1])
def get_all_cell_data_from_sheet(workbook, sheet_name, allowed_empty_line_slots=5):
    """Return every cell value of a sheet as one flat list, row after row.

    String cells (``data_type == 's'``) are stripped; other values are kept
    as stored.  Rows without any truthy value are skipped, and reading
    stops once ``allowed_empty_line_slots`` such rows appear in a row.

    Raises:
        ValueError: if the workbook has no sheet called ``sheet_name``.
    """
    try:
        sheet = workbook[sheet_name]
    except KeyError as error:
        raise ValueError(f"The '{sheet_name}' sheet is missing.") from error

    collected = []
    blank_streak = 0
    for row in sheet.rows:
        row_values = [
            str(cell.value).strip()
            if cell.value is not None and cell.data_type == 's'
            else cell.value
            for cell in row
        ]
        if any(row_values):
            blank_streak = 0
            collected += row_values
            continue
        blank_streak += 1
        if blank_streak >= allowed_empty_line_slots:
            break
    return collected
def index_markers(markers):
    """Group marker rows by the value of their "Strain AN" entry.

    Returns a dict mapping each strain accession number to the list of
    marker rows that carry it, in their original order.
    """
    by_strain = {}
    for row in markers:
        by_strain.setdefault(row["Strain AN"], []).append(row)
    return by_strain
def parse_strains(wb, locations, growth_media, markers, publications,
                  ontobiotopes):
    """Yield one ``StrainMirri`` per row of the strains sheet.

    Every column listed in ``MIRRI_FIELDS`` is read from the row, converted
    according to its attribute name and stored on the strain through
    ``rsetattr``.  Empty cells are skipped.  After the columns are
    processed, the genomic markers indexed under the strain id are attached.

    Args:
        wb: the workbook to read the strains sheet from.
        locations: rows of the locations sheet, indexed here by 'Locality'.
        growth_media: parsed growth media, indexed here by ``acronym``.
        markers: rows of the genomic information sheet.
        publications: parsed publications, indexed here by ``id``.
        ontobiotopes: rows of the ontobiotope sheet ("ID" and "Name").

    Raises:
        ValidationError: if the taxon name can not be parsed.
        NotImplementedError: if a date cell is neither a date nor a string.
    """
    ontobiotopes_by_id = {str(ont["ID"]): ont['Name'] for ont in ontobiotopes}
    ontobiotopes_by_name = {v: k for k, v in ontobiotopes_by_id.items()}

    locations = index_list_by(locations, 'Locality')
    growth_media = index_list_by_attr(growth_media, 'acronym')
    publications = index_list_by_attr(publications, 'id')
    markers = index_markers(markers)

    for strain_row in workbook_sheet_reader(wb, STRAINS, "Accession number"):
        strain = StrainMirri()
        strain_id = None
        label = None
        for field in MIRRI_FIELDS:
            label = field["label"]
            attribute = field["attribute"]
            value = strain_row[label]
            if value is None or value == '':
                continue

            if attribute == "id":
                # Keep the raw accession text so later error messages can
                # name the strain they refer to.
                strain_id = value
                collection, number = value.split(" ", 1)
                value = StrainId(collection=collection, number=number)
                rsetattr(strain, attribute, value)

            elif attribute == "restriction_on_use":
                rsetattr(strain, attribute, RESTRICTION_USE_TRANSLATOR[value])
            elif attribute == "nagoya_protocol":
                rsetattr(strain, attribute, NAGOYA_TRANSLATOR[value])
            elif attribute == "other_numbers":
                other_numbers = []
                for on in value.split(";"):
                    on = on.strip()
                    try:
                        collection, number = on.split(" ", 1)
                    except ValueError:
                        # No blank in the entry: it is a bare number.
                        collection = None
                        number = on
                    _id = StrainId(collection=collection, number=number)
                    other_numbers.append(_id)
                rsetattr(strain, attribute, other_numbers)
            elif attribute == "taxonomy.taxon_name":
                try:
                    add_taxon_to_strain(strain, value)
                except ValueError as error:
                    msg = f"The '{label}' for strain with Accession Number {strain_id} is not according to the specification."
                    raise ValidationError(msg) from error
            elif attribute == "taxonomy.organism_type":
                value = [OrganismType(val.strip())
                         for val in str(value).split(";")]
                rsetattr(strain, attribute, value)
            elif attribute in ("deposit.date", "collect.date", "isolation.date",
                               "catalog_inclusion_date"):
                if isinstance(value, date):
                    value = DateRange(
                        year=value.year, month=value.month, day=value.day
                    )
                elif isinstance(value, str):
                    value = DateRange().strpdate(value)
                else:
                    raise NotImplementedError()
                rsetattr(strain, attribute, value)
            elif attribute == 'growth.recommended_temp':
                temps = value.split(';')
                if len(temps) == 1:
                    # A single temperature is both the minimum and maximum.
                    _min, _max = float(temps[0]), float(temps[0])
                else:
                    _min, _max = float(temps[0]), float(temps[1])
                rsetattr(strain, attribute, {'min': _min, 'max': _max})
            elif attribute == "growth.recommended_media":
                sep = "/"
                if ";" in value:
                    sep = ";"
                # Use a separate name so the growth_media index built above
                # is not overwritten.
                recommended_media = [v.strip() for v in value.split(sep)]
                rsetattr(strain, attribute, recommended_media)
            elif attribute == 'growth.tested_temp_range':
                if value:
                    min_, max_ = value.split(";")
                    value = {'min': float(min_), 'max': float(max_)}
                    rsetattr(strain, attribute, value)
            elif attribute == "form_of_supply":
                rsetattr(strain, attribute, value.split(";"))
            elif attribute == "collect.location.coords":
                items = value.split(";")
                strain.collect.location.latitude = float(items[0])
                strain.collect.location.longitude = float(items[1])
                if len(items) > 2:
                    strain.collect.location.coord_uncertainty = items[2]

            elif attribute == "collect.location":
                location = locations[value]
                if 'Country' in location and location['Country']:
                    if location['Country'] == 'Unknown':
                        continue
                    country_3 = _get_country_alpha3(location['Country'])
                    strain.collect.location.country = country_3
                strain.collect.location.state = location["Region"]
                strain.collect.location.municipality = location["City"]
                strain.collect.location.site = location["Locality"]
            elif attribute in ("abs_related_files", "mta_files"):
                rsetattr(strain, attribute, value.split(";"))
            elif attribute in ("is_from_registered_collection",
                               "is_subject_to_quarantine", 'taxonomy.interspecific_hybrid',
                               "is_potentially_harmful", "genetics.gmo"):
                rsetattr(strain, attribute, TRUEFALSE_TRANSLATOR[value])
            elif attribute == "publications":
                pubs = []
                pub_ids = [v.strip() for v in str(value).split(";")]
                for pub_id in pub_ids:
                    pub = publications.get(pub_id, None)
                    if pub is None:
                        # Not in the literature sheet: treat the reference
                        # as a DOI when it has a slash, else a PubMed id.
                        pub = Publication()
                        if '/' in pub_id:
                            pub.doi = pub_id
                        else:
                            pub.pubmed_id = pub_id
                    pubs.append(pub)
                rsetattr(strain, attribute, pubs)
            elif attribute == 'ontobiotope':
                # NOTE(review): the names are translated to ids into
                # ``values`` but the untranslated ``value`` is what gets
                # stored; left as found until the intended target format is
                # confirmed.
                values = []
                for val in value.split(';'):
                    if val not in ontobiotopes_by_id:
                        val = ontobiotopes_by_name[val]
                    values.append(val)
                rsetattr(strain, attribute, value)
            elif attribute == 'other_denominations':
                value = [v.strip() for v in value.split(';')]
                rsetattr(strain, attribute, value)
            elif attribute == 'genetics.plasmids':
                value = [v.strip() for v in value.split(';')]
                rsetattr(strain, attribute, value)
            else:
                rsetattr(strain, attribute, value)

        # add markers
        strain_id = strain.id.strain_id
        if strain_id in markers:
            for marker in markers[strain_id]:
                _marker = GenomicSequenceBiolomics()
                _marker.marker_id = marker["INSDC AN"]
                _marker.marker_type = marker["Marker"]
                _marker.marker_seq = marker["Sequence"]
                strain.genetics.markers.append(_marker)
        yield strain
def write_mirri_excel(path, strains, growth_media, version):
    """Write strains and growth media to an excel file in MIRRI format.

    Args:
        path: destination of the excel file.
        strains: iterable of strain objects to write.
        growth_media: growth media to list in their own sheet.
        version: version of the MIRRI specification; only ``"20200601"``
            is implemented.

    Raises:
        NotImplementedError: if ``version`` is not a supported version.
    """
    if version != "20200601":
        # An unknown version used to return silently without writing any
        # file, which hides the mistake from the caller.
        raise NotImplementedError(
            f"MIRRI excel version {version!r} is not supported; "
            "only '20200601' is implemented"
        )
    _write_mirri_excel_20200601(path, strains, growth_media)
write_ontobiotopes(wb, ontobiotope_path) + + write_growth_media(wb, growth_media) + growth_media_indexes = [str(gm.acronym) for gm in growth_media] + + locations = {} + publications = {} + sexual_states = set(deepcopy(INITIAL_SEXUAL_STATES)) + genomic_markers = {} + strains_data = _deserialize_strains(strains, locations, growth_media_indexes, + publications, sexual_states, genomic_markers) + strains_data = list(strains_data) + + # write strain to generate indexed data + strain_sheet = wb.create_sheet("Strains") + strain_sheet.append([field["label"] for field in MIRRI_FIELDS]) + for strain_row in strains_data: + strain_sheet.append(strain_row) + redimension_cell_width(strain_sheet) + + # write locations + loc_sheet = wb.create_sheet("Geographic origin") + loc_sheet.append(["ID", "Country", "Region", "City", "Locality"]) + for index, loc_index in enumerate(locations.keys()): + location = locations[loc_index] + row = [index, location.country, location.state, location.municipality, + loc_index] + loc_sheet.append(row) + redimension_cell_width(loc_sheet) + + # write publications + pub_sheet = wb.create_sheet("Literature") + pub_sheet.append(PUB_HEADERS) + for publication in publications.values(): + row = [] + for pub_field in PUBLICATION_FIELDS: + # if pub_field['attribute'] == 'id': + # value = index + value = getattr(publication, pub_field['attribute'], None) + row.append(value) + pub_sheet.append(row) + redimension_cell_width(pub_sheet) + + # write sexual states + sex_sheet = wb.create_sheet("Sexual states") + for sex_state in sorted(list(sexual_states)): + sex_sheet.append([sex_state]) + redimension_cell_width(sex_sheet) + + # write genetic markers + markers_sheet = wb.create_sheet("Genomic information") + markers_sheet.append(['Strain AN', 'Marker', 'INSDC AN', 'Sequence']) + for strain_id, markers in genomic_markers.items(): + for marker in markers: + row = [strain_id, marker.marker_type, marker.marker_id, marker.marker_seq] + markers_sheet.append(row) + 
redimension_cell_width(markers_sheet) + + del wb["Sheet"] + wb.save(str(path)) + + +def _deserialize_strains(strains, locations, growth_media_indexes, + publications, sexual_states, genomic_markers): + for strain in strains: + strain_row = [] + for field in MIRRI_FIELDS: + attribute = field["attribute"] + + if attribute == "id": + value = strain.id.strain_id + elif attribute == "restriction_on_use": + value = rgetattr(strain, attribute) + if value is not None: + value = REV_RESTRICTION_USE_TRANSLATOR[value] + elif attribute == "nagoya_protocol": + value = rgetattr(strain, attribute) + if value: + value = REV_NAGOYA_TRANSLATOR[value] + elif attribute == "other_numbers": + value = rgetattr(strain, attribute) + if value is not None: + value = [f"{on.collection} {on.number}" for on in value] + value = "; ".join(value) + elif attribute == 'other_denominations': + od = strain.other_denominations + value = "; ".join(od) if od else None + elif attribute in ( + "is_from_registered_collection", + "is_subject_to_quarantine", + "is_potentially_harmful", + "genetics.gmo", + "taxonomy.interspecific_hybrid" + ): + value = rgetattr(strain, attribute) + if value is True: + value = 2 + elif value is False: + value = 1 + else: + value = None + elif attribute == "taxonomy.taxon_name": + value = strain.taxonomy.long_name + elif attribute in ("deposit.date", "collect.date", "isolation.date", + 'catalog_inclusion_date'): + value = rgetattr(strain, attribute) + value = value.strfdate if value else None + elif attribute == "growth.recommended_media": + value = rgetattr(strain, attribute) + if value is not None: + for gm in value: + gm = str(gm) + if gm not in growth_media_indexes: + print(gm, growth_media_indexes) + msg = f"Growth media {gm} not in the provided ones" + continue + raise ValueError(msg) + value = "/".join(value) + elif attribute in ('growth.tested_temp_range', + "growth.recommended_temp"): + value = rgetattr(strain, attribute) + if value: + value = f'{value["min"]}; 
{value["max"]}' + elif attribute == "form_of_supply": + value = rgetattr(strain, attribute) + value = ";".join(value) + elif attribute == "collect.location.coords": + lat = strain.collect.location.latitude + long = strain.collect.location.longitude + if lat is not None and long is not None: + value = f"{lat};{long}" + else: + value = None + + elif attribute == "collect.location": + location = strain.collect.location + loc_index = _build_location_index(location) + if loc_index is None: + continue + if loc_index not in locations: + locations[loc_index] = location + value = loc_index + elif attribute in ("abs_related_files", "mta_files"): + value = rgetattr(strain, attribute) + value = ";".join(value) if value else None + elif attribute == "taxonomy.organism_type": + value = rgetattr(strain, attribute) + if value: + value = "; ".join([str(v.code) for v in value]) + + elif attribute == "history": + value = rgetattr(strain, attribute) + if value is not None: + value = " < ".join(value) + elif attribute == "genetics.sexual_state": + value = rgetattr(strain, attribute) + if value: + sexual_states.add(value) + elif attribute == "genetics.ploidy": + value = rgetattr(strain, attribute) + elif attribute == "taxonomy.organism_type": + organism_types = rgetattr(strain, attribute) + if organism_types is not None: + value = [org_type.code for org_type in organism_types] + value = ";".join(value) + elif attribute == 'publications': + value = [] + for pub in strain.publications: + value.append(pub.id) + if pub.id not in publications: + publications[pub.id] = pub + value = ';'.join(str(v) for v in value) if value else None + elif attribute == 'genetics.plasmids': + value = rgetattr(strain, attribute) + if value is not None: + value = ';'.join(value) + else: + value = rgetattr(strain, attribute) + + strain_row.append(value) + genomic_markers[strain.id.strain_id] = strain.genetics.markers + yield strain_row + + +def _build_location_index(location): + index = [] + if location.country: 
def write_ontobiotopes(workbook, ontobiotype_path):
    """Copy the tab-separated ontobiotope file into an "Ontobiotope" sheet.

    Args:
        workbook: workbook in which the sheet is created.
        ontobiotype_path: path object (it must provide ``open``) of the
            tab-separated ontobiotope file.
    """
    ws = workbook.create_sheet("Ontobiotope")
    # newline="" is what the csv module asks for so quoted line breaks are
    # parsed correctly; the explicit encoding avoids depending on the
    # platform default.
    # NOTE(review): assumes the data file is UTF-8 encoded -- confirm.
    with ontobiotype_path.open(encoding="utf-8", newline="") as fhand:
        for row in csv.reader(fhand, delimiter="\t"):
            ws.append(row)
    redimension_cell_width(ws)
+RISK_GROUP = "risk_group" +DUAL_USE = "dual_use" +QUARANTINE = "quarantine" +ORGANISM_TYPE = "organism_type" +TAXON_NAME = "taxon_name" +INFRASUBSPECIFIC_NAME = "infrasubspecific_names" +COMMENTS_ON_TAXONOMY = "comments_on_taxonomy" +STATUS = "status" +HISTORY_OF_DEPOSIT = "history_of_deposit" +DEPOSITOR = "depositor" +DATE_OF_DEPOSIT = "date_of_deposit" +COLLECTED_BY = "collected_by" +DATE_OF_COLLECTION = "date_of_collection" +ISOLATED_BY = "isolated_by" +DATE_OF_ISOLATION = "date_of_isolation" +DATE_OF_INCLUSION = "date_of_inclusion_on_catalog" +TESTED_TEMPERATURE_GROWTH_RANGE = "tested_temperature_growth_range" +RECOMMENDED_GROWTH_TEMP = "recommended_growth_temperature" +RECOMMENDED_GROWTH_MEDIUM = "recommended_media_for_growth" +FORM_OF_SUPPLY = "form_of_supply" +GEO_COORDS = "coordinates_of_geographic_origin" +ACCESSION_NAME = "other_denomination" +ALTITUDE = "altitude_of_geographic_origin" +GEOGRAPHIC_ORIGIN = "geographic_origin" +GMO = "gmo" +GMO_CONSTRUCTION_INFO = "gmo_construction_information" +MUTANT_INFORMATION = "mutant_information" +GENOTYPE = "genotype" +LITERATURE = "literature" +SEXUAL_STATE = "sexual_state" +PLOIDY = "ploidy" +INTERSPECIFIC_HYBRID = "interspecific_hybrid" +HYBRIDS = 'hybrids' +PLANT_PATHOGENICITY_CODE = "plant_pathogenicity_code" +PATHOGENICITY = "pathogenicity" +ENZYME_PRODUCTION = "enzyme_production" +PRODUCTION_OF_METABOLITES = "production_of_metabolites" +APPLICATIONS = "applications" +REMARKS = "remarks" +PLASMIDS = "plasmids" +PLASMIDS_COLLECTION_FIELDS = "plasmids_collections_fields" +SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation" +ISOLATION_HABITAT = "isolation_habitat" +ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat" +LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome" + +# StrainId +STRAIN_ID = "id" +COLLECTION_CODE = "collection_code" +STRAIN_PUI = "strain_pui" +STRAIN_URL = "strain_url" + +ID_SYNONYMS = 'id_synonyms' +# Taxonomy +GENUS = "genus" 
+SPECIES = "species" + +# Location +COUNTRY = "countryOfOriginCode" +SITE = "site" +STATE = "state" +PROVINCE = "province" +MUNICIPALITY = "municipality" +ISLAND = "island" +OTHER = "other" +LATITUDE = "latitude" +LONGITUDE = "longitude" +ALTITUDE = "altitude" +GEOREF_METHOD = "georeferencingMethod" +COORDUNCERTAINTY = "coordUncertainty" +COORD_SPATIAL_REFERENCE = "coordenatesSpatialReference" +LOCATION = "location" + +ALLOWED_COLLECTING_SITE_KEYS = [ + COUNTRY, + STATE, + PROVINCE, + ISLAND, + MUNICIPALITY, + OTHER, + SITE, + LATITUDE, + LONGITUDE, + ALTITUDE, + GEOREF_METHOD, + COORDUNCERTAINTY, + COORD_SPATIAL_REFERENCE, +] + +MIRRI_FIELDS = [ + {"attribute": "id", "label": "Accession number"}, + {"attribute": "restriction_on_use", "label": "Restrictions on use"}, + {"attribute": "nagoya_protocol", + "label": "Nagoya protocol restrictions and compliance conditions"}, + {"attribute": ABS_RELATED_FILES, "label": "ABS related files"}, + {"attribute": "mta_files", "label": "MTA file"}, + {"attribute": "other_numbers", "label": "Other culture collection numbers"}, + {"attribute": "is_from_registered_collection", + "label": "Strain from a registered collection"}, + {"attribute": "risk_group", "label": "Risk Group"}, + {"attribute": "is_potentially_harmful", "label": "Dual use"}, + {"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"}, + {"attribute": "taxonomy.organism_type", "label": "Organism type"}, + {"attribute": "taxonomy.taxon_name", "label": "Taxon name"}, + {"attribute": "taxonomy.infrasubspecific_name", + "label": "Infrasubspecific names"}, + {"attribute": "taxonomy.comments", "label": "Comment on taxonomy"}, + {"attribute": "taxonomy.interspecific_hybrid", + "label": "Interspecific hybrid"}, + {"attribute": "status", "label": "Status"}, + {"attribute": "history", "label": "History of deposit", }, + {"attribute": "deposit.who", "label": "Depositor"}, + {"attribute": "deposit.date", "label": "Date of deposit"}, + {"attribute": 
"catalog_inclusion_date", + "label": "Date of inclusion in the catalogue"}, + {"attribute": "collect.who", "label": "Collected by"}, + {"attribute": "collect.date", "label": "Date of collection"}, + {"attribute": "isolation.who", "label": "Isolated by"}, + {"attribute": "isolation.date", "label": "Date of isolation"}, + {"attribute": "isolation.substrate_host_of_isolation", + "label": "Substrate/host of isolation"}, + {"attribute": "growth.tested_temp_range", + "label": "Tested temperature growth range"}, + {"attribute": "growth.recommended_temp", + "label": "Recommended growth temperature"}, + {"attribute": "growth.recommended_media", + "label": "Recommended medium for growth"}, + {"attribute": "form_of_supply", "label": "Form of supply"}, + {"attribute": "other_denominations", "label": "Other denomination"}, + {"attribute": "collect.location.coords", + "label": "Coordinates of geographic origin"}, + {"attribute": "collect.location.altitude", + "label": "Altitude of geographic origin"}, + {"attribute": "collect.location", "label": "Geographic origin"}, + {"attribute": "collect.habitat", "label": "Isolation habitat"}, + {"attribute": "collect.habitat_ontobiotope", + "label": "Ontobiotope term for the isolation habitat"}, + {"attribute": "genetics.gmo", "label": "GMO"}, + {"attribute": "genetics.gmo_construction", + "label": "GMO construction information"}, + {"attribute": "genetics.mutant_info", "label": "Mutant information"}, + {"attribute": "genetics.genotype", "label": "Genotype"}, + {"attribute": "genetics.sexual_state", "label": "Sexual state"}, + {"attribute": "genetics.ploidy", "label": "Ploidy"}, + {"attribute": "genetics.plasmids", "label": "Plasmids"}, + {"attribute": "genetics.plasmids_in_collections", + "label": "Plasmids collections fields"}, + {"attribute": "publications", "label": "Literature"}, + {"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"}, + {"attribute": "pathogenicity", "label": "Pathogenicity"}, + {"attribute": 
"enzyme_production", "label": "Enzyme production"}, + {"attribute": "production_of_metabolites", + "label": "Production of metabolites"}, + {"attribute": "applications", "label": "Applications", }, + {"attribute": "remarks", "label": "Remarks"}, + {"attribute": LITERATURE_LINKED_TO_SEQ_GENOME, + "label": "Literature linked to the sequence/genome"}, +] + +ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma", + 'forma.specialis'] +ALLOWED_TAXONOMIC_RANKS = ["family", "genus", "species"] + ALLOWED_SUBTAXA + +# nagoya +NAGOYA_NO_RESTRICTIONS = "no_known_restrictions_under_the_Nagoya_protocol" +NAGOYA_DOCS_AVAILABLE = "documents_providing_proof_of_legal_access_and_terms_of_use_available_at_the_collection" +NAGOYA_PROBABLY_SCOPE = "strain_probably_in_scope,_please_contact_the_culture_collection" + +ALLOWED_NAGOYA_OPTIONS = [NAGOYA_NO_RESTRICTIONS, + NAGOYA_DOCS_AVAILABLE, NAGOYA_PROBABLY_SCOPE] + +# Use restriction +NO_RESTRICTION = "no_restriction" +ONLY_RESEARCH = "only_research" +COMMERCIAL_USE_WITH_AGREEMENT = "commercial_use_with_agreement" + +ALLOWED_RESTRICTION_USE_OPTIONS = [ + NO_RESTRICTION, + ONLY_RESEARCH, + COMMERCIAL_USE_WITH_AGREEMENT, +] + +ALLOWED_RISK_GROUPS = ["1", "2", "3", "4"] + +AGAR = "Agar" +CRYO = "Cryo" +DRY_ICE = "Dry Ice" +LIQUID_CULTURE_MEDIUM = "Liquid Culture Medium" +LYO = "Lyo" +OIL = "Oil" +WATER = "Water" +ALLOWED_FORMS_OF_SUPPLY = [AGAR, CRYO, DRY_ICE, + LIQUID_CULTURE_MEDIUM, LYO, OIL, WATER] + +DEPOSIT = "deposit" +ISOLATION = "isolation" +COLLECT = "collect" +GROWTH = "growth" +GENETICS = "genetics" +TAXONOMY = "taxonomy" +# Markers +MARKERS = "markers" +MARKER_TYPE = "marker_type" +MARKER_INSDC = "INSDC" +MARKER_SEQ = "marker_seq" +ALLOWED_MARKER_TYPES = [ + {"acronym": "16S rRNA", "marker": "16S rRNA"}, + {"acronym": "ACT", "marker": "Actin"}, + {"acronym": "CaM", "marker": "Calmodulin"}, + {"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"}, + {"acronym": "ITS", + "marker": "nuclear 
# Publication attribute names
PUBLICATIONS = "publications"
PUB_ID = "id"
PUB_DOI = "pub_doi"
# NOTE(review): an empty attribute name looks like a placeholder -- confirm
# the intended value before relying on this constant.
PUB_PUBMED_ID = ''
PUB_FULL_REFERENCE = "full_reference"
PUB_TITLE = "title"
PUB_AUTHORS = "authors"
PUB_JOURNAL = "journal"
PUB_YEAR = "year"
PUB_VOLUME = "volume"
PUB_ISSUE = "issue"
PUB_FIRST_PAGE = "first_page"
PUB_LAST_PAGE = "last_page"
BOOK_TITLE = "book_title"
BOOK_EDITOR = "book_editor"
BOOK_PUBLISHER = "book_publisher"


# Columns of the literature sheet, in order: the header label and the
# publication attribute the column is read from.
PUBLICATION_FIELDS = [
    {"label": "ID", "attribute": PUB_ID},
    {"label": "Full reference", "attribute": PUB_FULL_REFERENCE},
    {"label": "Authors", "attribute": PUB_AUTHORS},
    {"label": "Title", "attribute": PUB_TITLE},
    {"label": "Journal", "attribute": PUB_JOURNAL},
    {"label": "Year", "attribute": PUB_YEAR},
    {"label": "Volume", "attribute": PUB_VOLUME},
    {"label": "Issue", "attribute": PUB_ISSUE},
    {"label": "First page", "attribute": PUB_FIRST_PAGE},
    # This column used to point at PUB_FIRST_PAGE, repeating the first page.
    {"label": "Last page", "attribute": PUB_LAST_PAGE},
    {"label": "Book title", "attribute": BOOK_TITLE},
    {"label": "Editors", "attribute": BOOK_EDITOR},
    {"label": "Publisher", "attribute": BOOK_PUBLISHER},
]
class FakeCountry:
    """Minimal stand-in for a pycountry country.

    ``get_pycountry`` returns it for 'INW', which is not looked up in
    pycountry.
    """

    def __init__(self, name=None, code3=None):
        self.code3 = code3
        self.name = name

    @property
    def alpha_3(self):
        """Alias of ``code3`` under the name used for pycountry lookups."""
        return self.code3


def _lookup_country(database, **query):
    """Return ``database.get(**query)``, or None if the lookup itself fails."""
    try:
        return database.get(**query)
    except (AttributeError, KeyError):
        return None


def get_pycountry(value):
    """Return the country matching ``value``, a country name or alpha-3 code.

    Returns:
        A ``FakeCountry`` for 'INW', the matching pycountry object
        otherwise, or None when nothing matches.
    """
    if value == 'INW':
        return FakeCountry(name='International Water', code3='INW')

    country = get_country_from_name(value)
    if country is None:
        country = get_country_from_alpha3(value)
    return country


def get_country_from_name(name):
    """Return the country whose name, common name or official name is ``name``.

    Current countries are searched before historic ones. Every lookup is
    guarded on its own, so a failing lookup no longer prevents the
    remaining ones from being tried.

    Returns:
        The matching pycountry object, or None when nothing matches.
    """
    for database in (pycountry.countries, pycountry.historic_countries):
        for field in ("name", "common_name", "official_name"):
            country = _lookup_country(database, **{field: name})
            if country is not None:
                return country
    return None


def get_country_from_alpha3(code):
    """Return the current or historic country with the alpha-3 code ``code``.

    Returns:
        The matching pycountry object, or None when nothing matches.
    """
    for database in (pycountry.countries, pycountry.historic_countries):
        country = _lookup_country(database, alpha_3=code)
        if country is not None:
            return country
    return None
def is_valid_nagoya(strain):
    """Tell whether the strain carries the country its dates make necessary.

    The reference date is the first one that is not None among the
    collection, isolation, deposit and catalog inclusion dates. A strain is
    rejected only when the year of that date is 2014 or later and the
    collecting location has no country.

    Returns:
        bool: False for the rejected case described above, True otherwise.
    """
    reference_date = strain.collect.date
    # Fallbacks are wrapped in callables so each one is read only when all
    # the previous dates were None.
    fallbacks = (
        lambda: strain.isolation.date,
        lambda: strain.deposit.date,
        lambda: strain.catalog_inclusion_date,
    )
    for read_date in fallbacks:
        if reference_date is not None:
            break
        reference_date = read_date()

    if reference_date is None:
        return True

    year = reference_date._year
    if year is None:
        return True
    if not year >= 2014:
        return True
    return strain.collect.location.country is not None
class Entity():
    """Entity an error belongs to, identified by its acronym.

    Every upper-case method of the class stands for one known acronym and
    returns the readable name of that entity.

    Args:
        acronym: acronym of the entity, e.g. ``'STD'``.
    """

    def __init__(self, acronym: str) -> None:
        self.acronym = acronym

    def __str__(self) -> str:
        return "Entity {}: {}".format(self.acronym, self.name)

    @property
    def _acronyms(self) -> list:
        # Known acronyms are discovered from the upper-case callables.
        found = []
        for attr_name in dir(self):
            if not attr_name.isupper():
                continue
            if not callable(getattr(self, attr_name)):
                continue
            if attr_name.startswith("__"):
                continue
            found.append(attr_name)
        return found

    @property
    def _names(self) -> dict:
        names = {}
        for known_acronym in self._acronyms:
            names[known_acronym] = getattr(self, known_acronym)()
        return names

    @property
    def name(self) -> str:
        """Readable name of the entity; KeyError for an unknown acronym."""
        try:
            known_names = self._names
            return known_names[self.acronym]
        except KeyError:
            raise KeyError(f'Unknown acronym {self.acronym}.')

    @property
    def acronym(self) -> str:
        return self._acronym

    @acronym.setter
    def acronym(self, acronym: str) -> None:
        self._acronym = acronym

    def EFS(self) -> str:
        return 'Excel File Structure'

    def GMD(self) -> str:
        return 'Growth Media'

    def GOD(self) -> str:
        return 'Geographic Origin'

    def LID(self) -> str:
        return 'Literature'

    def STD(self) -> str:
        return 'Strains'

    def GID(self) -> str:
        return 'Genomic Information'

    def OTD(self) -> str:
        return 'Ontobiotope'

    def UCT(self) -> str:
        return 'Uncategorized'
class ErrorLog():
    def __init__(self, input_filename: str, cc: Optional[str] = None, date: Optional[Union[str, datetime]] = None, limit: int = 100):
        """
        Collector of Error instances found in one input file.

        Args:
            input_filename (str): name of the file the errors belong to.
            cc (str, optional): name of the curator. Defaults to None.
            date (str, optional): date associated with the file. It is
                stored as given; only assigning the ``date`` property
                parses strings. Defaults to None.
            limit (int, optional): limit of errors to print to the report.
                It is only stored here; no method of this class reads it.
                Defaults to 100.
        """
        self._input_filename = input_filename
        self._cc = cc
        self._date = date
        self._errors = {}
        self.limit = limit
        self._counter = 0

    def __str__(self) -> str:
        # Header first, then one line per error, grouped by entity acronym.
        lines = [f"""Error Log for file {self._input_filename}\nENTITY | CODE | MESSAGE"""]
        for acronym, error_list in self.get_errors().items():
            lines.extend(
                f"{acronym:6} | {error.code:6} | {error.message[:100]}"
                for error in error_list
            )
        return "\n".join(lines)

    @property
    def input_filename(self) -> str:
        return self._input_filename

    @input_filename.setter
    def input_filename(self, input_filename: str) -> None:
        self._input_filename = input_filename

    @property
    def cc(self) -> Optional[str]:
        return self._cc

    @cc.setter
    def cc(self, cc: Optional[str]) -> None:
        self._cc = cc

    @property
    def date(self) -> Optional[Union[str, datetime]]:
        return self._date

    @date.setter
    def date(self, date: Optional[Union[str, datetime]] = None) -> None:
        # Strings must follow the day-month-year pattern below.
        parsed = datetime.strptime(date, r'%d-%m-%Y') if isinstance(date, str) else date
        self._date = parsed

    def get_errors(self) -> dict:
        """
        Get all errors

        Returns:
            dict: lists of Error instances keyed by entity acronym.
        """
        return self._errors

    def add_error(self, error: Error) -> None:
        """
        Add an error to the list of its entity.

        Args:
            error (Error): Error instance.
        """
        self._errors.setdefault(error.entity.acronym, []).append(error)
Defaults to None. + """ + + def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None): + self.code = code.upper() + self.pk = pk + self.value = value + + @property + def _codes(self) -> list: + return [ + func + for func in dir(self) + if func.isupper() and + callable(getattr(self, func)) and + not func.startswith("__") + ] + + @property + def _messages(self) -> dict: + return {code: getattr(self, code) for code in self._codes} + + @property + def message(self) -> str: + if not self._validate_code(): + raise ValueError(f"{self.code} not found") + return self._messages[self.code]() + + @property + def code(self) -> str: + return self._code + + @code.setter + def code(self, code: str) -> None: + self._code = code.upper() + + def _validate_code(self) -> bool: + return self.code in self._codes + + @property + def pk(self) -> str: + return self._pk + + @pk.setter + def pk(self, pk: str) -> None: + self._pk = pk + + @property + def value(self) -> str: + return self._value + + @value.setter + def value(self, value: str) -> None: + self._value = value + + """ + Excel File Structure Error Codes + """ + + def EXL00(self): + return f"The provided file '{self.pk}' is not an excel(xlsx) file" + + def EFS01(self): + return "The 'Growth media' sheet is missing. Please check the provided excel template." + + def EFS02(self): + return "The 'Geographic origin' sheet is missing. Please check the provided excel template." + + def EFS03(self): + return "The 'Literature' sheet is missing. Please check the provided excel template." + + def EFS04(self): + return "The 'Sexual state' sheet is missing. Please check the provided excel template." + + def EFS05(self): + return "The 'Strains' sheet is missing. Please check the provided excel template." + + def EFS06(self): + return "The 'Ontobiotope' sheet is missing. Please check the provided excel template." + + def EFS07(self): + return "The 'Markers' sheet is missing. Please check the provided excel template." 
+ + def EFS08(self): + return "The 'Genomic information' sheet is missing. Please check the provided excel template." + + """ + Growth Media Error Codes + """ + + def GMD01(self): + return "The 'Acronym' column is a mandatory field in the Growth Media sheet." + + def GMD02(self): + return "The 'Acronym' column is empty or has missing values." + + def GMD03(self): + return "The 'Description' column is a mandatory field in the Growth Media sheet. The column can not be empty." + + def GMD04(self): + return f"The 'Description' for growth media with Acronym {self.pk} is missing." + + """ + Geographic Origin Error Codes + """ + + def GOD01(self): + return "The 'ID' column is a mandatory field in the Geographic Origin sheet." + + def GOD02(self): + return "The 'ID' column is empty or has missing values." + + def GOD03(self): + return "The 'Country' column is a mandatory field in the Geographic Origin sheet. The column can not be empty." + + def GOD04(self): + return f"The 'Country' for geographic origin with ID {self.pk} is missing." + + def GOD05(self): + return f"The 'Country' for geographic origin with ID {self.pk} is incorrect." + + def GOD06(self): + return f"The 'Locality' column is a mandatory field in the Geographic Origin sheet. The column can not be empty." + + def GOD07(self): + return f"The 'Locality' for geographic origin with ID {self.pk} is missing." + + """ + Literature Error Codes + """ + + def LID01(self): + return "The 'ID' column is a mandatory field in the Literature sheet." + + def LID02(self): + return "The 'ID' column empty or missing values." + + def LID03(self): + return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty." + + def LID04(self): + return f"The 'Full reference' for literature with ID {self.pk} is missing." + + def LID05(self): + return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty." 
def LID17(self):
    """Return the message listing the fields a literature row must provide.

    The text has one sentence for journal articles and one for books.
    """
    journal_rule = 'If journal; Title, Authors, journal, year and first page are required'
    book_rule = 'If Book; Book Title, Authors, Year, Editors, Publishers'
    # The two sentences used to be glued together ("...requiredIf Book;...").
    return f"{journal_rule}. {book_rule}"
def STD11(self):
    """Return the message for an invalid 'Strain from a registered collection'.

    Reads ``self.pk`` (the strain accession number) and ``self.value`` (the
    rejected cell value).
    """
    # The validation configuration for this field only accepts "1" and "2",
    # so the message must not advertise 3 as a valid value.
    return (f"The 'Strain from a registered collection' for strain with Accession Number {self.pk} is not according to specification."
            f" Your value is {self.value} and the accepted values are 1, 2.")
def STD19(self):
    """Return the message for an 'Organism type' outside the accepted list.

    Reads ``self.pk`` (the strain accession number) and ``self.value`` (the
    rejected cell value).
    """
    accepted = ("'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', "
                "'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', "
                "1, 2, 3, 4, 5, 6, 7, 8, 9")
    problem = f"The 'Organism type' for strain with Accession Number {self.pk} is not according to specification."
    detail = f" Your value is {self.value} and the accepted values are {accepted}."
    return problem + detail
def STD39(self):
    """Return the message for an invalid 'Coordinates of geographic origin'.

    Reads ``self.pk`` (the strain accession number).
    """
    # Each fragment now starts with a space: the adjacent literals used to be
    # concatenated as "incorrect.The allowed" and "must bebetween".
    # The separator is reported as ';' because that is what the coordinates
    # validator (is_valid_coords) actually splits on.
    return (f"The 'Coordinates of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
            " The allowed formats are two or three decimal numbers separated by ';'. Moreover, the first number must be"
            " between [-90, 90], the second between [-180, 180], and the third, if provided, can assume any value.")
def GID10(self):
    """Return the message for an invalid genomic 'Sequence' value.

    Reads ``self.pk`` (the strain accession number of the row).
    """
    rule = " It must be a sequence of 'G', 'T', 'A', 'C' characteres of any length and without white spaces."
    problem = "The 'Sequence' for genomic information with Strain AN {} is incorrect.".format(self.pk)
    return problem + rule
def validate_mirri_excel(fhand, version="20200601"):
    """Validate an excel file against the given MIRRI specification version.

    Args:
        fhand: file handle of the excel file, passed on to ``validate_excel``.
        version: specification version; only "20200601" is supported.

    Returns:
        Whatever ``validate_excel`` returns for the 20200601 configuration.

    Raises:
        NotImplementedError: for any other version.
    """
    if version != "20200601":
        raise NotImplementedError("Only version20200601 is implemented")
    return validate_excel(fhand, MIRRI_20200601_VALLIDATION_CONF)
def _get_sheet_headers(sheet):
    """Return the cell values of the first row of *sheet* as a list.

    Args:
        sheet: worksheet-like object exposing ``iter_rows(min_row, max_row)``.

    Returns:
        The header values in column order, or an empty list when the sheet
        has no rows at all.
    """
    # Without a default, next() raises StopIteration on an empty sheet; since
    # this helper is called from a generator, that would surface as a
    # RuntimeError instead of "mandatory column missing" errors.
    first_row = next(sheet.iter_rows(min_row=1, max_row=1), ())
    return [cell.value for cell in first_row]
def get_all_in_memory_sheet(workbook, in_memory_sheet_conf):
    """Load the configured sheets into dictionaries keyed by one column.

    Args:
        workbook: workbook handed to ``workbook_sheet_reader``.
        in_memory_sheet_conf: iterable of dicts with the keys 'sheet_name'
            and 'indexed_by' (the column whose value keys each row).

    Returns:
        A dict mapping each sheet name to a dict of its rows, keyed by the
        row's value in the 'indexed_by' column. A later row with the same
        key replaces an earlier one.
    """
    sheets_by_name = {}
    for conf in in_memory_sheet_conf:
        name, key_column = conf['sheet_name'], conf['indexed_by']
        rows_by_key = {}
        for record in workbook_sheet_reader(workbook, name):
            rows_by_key[record[key_column]] = record
        sheets_by_name[name] = rows_by_key
    return sheets_by_name
def is_valid_pub(row):
    """Check that a Literature row carries enough data to identify the work.

    A row with a 'Full reference' is always accepted. Otherwise a row with a
    'Title' is treated as a journal article and needs authors, journal, year,
    volume and first page; a row without a 'Title' is treated as a book and
    needs authors, year, editors, publisher and book title.

    Args:
        row: mapping of column header to cell value for one sheet row.

    Returns:
        True when the row is a usable reference, False otherwise.
    """
    if row.get('Full reference'):
        return True

    authors = row.get('Authors')
    year = row.get('Year')

    if row.get('Title'):
        # The sheet column is 'Volume'; the misspelled 'Volumen' key is still
        # honoured so rows built with the old name keep working.
        volume = row.get('Volume') or row.get('Volumen')
        # The year test used to read "not not year", which rejected every
        # journal row that *did* have a year.
        required = (authors, row.get('Journal'), year, volume,
                    row.get('First page'))
    else:
        # Same story for 'Publisher' (sheet column) vs. legacy 'Publishers'.
        publisher = row.get('Publisher') or row.get('Publishers')
        required = (authors, year, row.get('Editors'), publisher,
                    row.get('Book title'))

    return all(required)
def is_valid_date(value, validation_conf):
    """Check that *value* is a plausible date.

    Accepted inputs are ``None`` (nothing to validate), ``datetime`` objects,
    and strings or integers in the forms YYYY, YYYYMM or YYYYMMDD (strings
    may also use '-' or '/' between the parts).

    Args:
        value: cell value to check.
        validation_conf: step configuration; not used by this check.

    Returns:
        True when the value is missing or denotes a date between the year
        1700 and the current year with a real month and day; False otherwise.
    """
    if value is None:
        return True

    if isinstance(value, datetime):
        year, month, day = value.year, value.month, value.day
    elif isinstance(value, (int, str)):
        # Numeric cells such as 20200131 are read digit by digit like their
        # text form, instead of being taken as one huge "year".
        digits = str(value).replace('-', '').replace('/', '')
        month = day = None
        try:
            year = int(digits[:4])
            if len(digits) >= 6:
                month = int(digits[4:6])
            if len(digits) >= 8:
                day = int(digits[6:8])
        except (IndexError, TypeError, ValueError):
            return False
    else:
        return False

    if year < 1700 or year > datetime.now().year:
        return False
    if month is not None:
        # The upper bound used to be 13, which let month 13 through and made
        # monthrange() raise for values such as '20191301'.
        if month < 1 or month > 12:
            return False
        if day is not None and (day < 1 or day > monthrange(year, month)[1]):
            return False
    return True
def is_valid_unique(value, validation_conf):
    """Check that *value* has not been seen before in its column.

    ``validation_conf['shown_values']`` maps each column label to a dict
    whose keys are the values seen so far for that column (the dict values
    are always ``None``; only membership matters). A new value is recorded
    as a side effect.

    Args:
        value: cell value to check.
        validation_conf: step configuration providing 'label' and
            'shown_values'.

    Returns:
        False when the value was already recorded for the column, True
        otherwise.
    """
    column = validation_conf['label']
    seen = validation_conf['shown_values'].setdefault(column, {})
    if value in seen:
        return False
    seen[value] = None
    return True
"sources" +SEPARATOR = "separator" +MATCH = 'match' +VALUES = 'values' +DATE = 'date' +COORDINATES = 'coord' +NUMBER = 'number' +TAXON = 'taxon' +UNIQUE = 'unique' +ROW_VALIDATION = 'row_validation' +NAGOYA = 'nagoya' +BIBLIO = 'bibliography' diff --git a/mirri/validation/validation_conf_20200601.py b/mirri/validation/validation_conf_20200601.py new file mode 100644 index 0000000..b8f5a8b --- /dev/null +++ b/mirri/validation/validation_conf_20200601.py @@ -0,0 +1,548 @@ +from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE, + ERROR_CODE, FIELD, MANDATORY, MATCH, + MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE, + UNIQUE, + VALIDATION, VALUES, BIBLIO) +from mirri.settings import (GEOGRAPHIC_ORIGIN, ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO, + STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET) +# MARKERS, +# SEXUAL_STATE_SHEET, +# RESOURCE_TYPES_VALUES, +# FORM_OF_SUPPLY_SHEET, +# PLOIDY_SHEET) + + +STRAIN_FIELDS = [ + { + FIELD: "Accession number", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: 'STD01'}, + {TYPE: UNIQUE, ERROR_CODE: 'STD03'}, + {TYPE: MISSING, ERROR_CODE: "STD02"}, + {TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"} + ] + }, + { + FIELD: "Restrictions on use", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD05"}, + {TYPE: MISSING, ERROR_CODE: "STD06"}, + {TYPE: CHOICES, VALUES: ["1", "2", "3"], + MULTIPLE: False, ERROR_CODE: "STD07"} + ] + }, + { + FIELD: "Nagoya protocol restrictions and compliance conditions", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD08"}, + {TYPE: MISSING, ERROR_CODE: "STD09"}, + {TYPE: CHOICES, VALUES: ["1", "2", "3"], + MULTIPLE: False, ERROR_CODE: "STD10"} + ] + }, + { + FIELD: "ABS related files", + VALIDATION: [], + }, + { + FIELD: "MTA file", + VALIDATION: [], + }, + { + FIELD: "Other culture collection numbers", + # VALIDATION: [ + # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07", + # MULTIPLE: True, 
SEPARATOR: ";"} + # ] + }, + { + FIELD: "Strain from a registered collection", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD11"} + ] + }, + { + FIELD: "Risk Group", + + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD12"}, + {TYPE: MISSING, ERROR_CODE: "STD13"}, + {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"], + MULTIPLE: False, ERROR_CODE: "STD14"} + ] + }, + { + FIELD: "Dual use", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD15"} + ] + }, + { + FIELD: "Quarantine in Europe", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD16"} + ] + }, + { + FIELD: "Organism type", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD17"}, + {TYPE: MISSING, ERROR_CODE: "STD18"}, + {TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria", + "Cyanobacteria", "Filamentous Fungi", + "Phage", "Plasmid", "Virus", "Yeast", + "1", "2", "3", "4", "5", "6", "7", "8", "9"], + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"} + ] + }, + { + FIELD: "Taxon name", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD20"}, + {TYPE: MISSING, ERROR_CODE: "STD21"}, + {TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True, + SEPARATOR: ';'} + ] + }, + { + FIELD: "Infrasubspecific names", + }, + { + FIELD: "Comment on taxonomy", + }, + { + FIELD: "Interspecific hybrid", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD23"} + ] + }, + { + FIELD: "Status", + }, + { + FIELD: "History of deposit", + VALIDATION: [ + # {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex + # MULTIPLE: True, SEPARATOR: ";"} + ] + }, + { + FIELD: "Depositor" + }, + { + FIELD: "Date of deposit", + VALIDATION: [ + {TYPE: DATE, ERROR_CODE: "STD25"}, + ] + }, + { + FIELD: "Date of inclusion in the catalogue", + VALIDATION: [ + {TYPE: DATE, ERROR_CODE: "STD26"}, + ] + }, + { + FIELD: "Collected by", + }, + { + FIELD: "Date of collection", + VALIDATION: [ + {TYPE: DATE, ERROR_CODE: "STD27"}, + ] + }, + { + 
FIELD: "Isolated by", + }, + { + FIELD: "Date of isolation", + VALIDATION: [ + {TYPE: DATE, ERROR_CODE: "STD28"}, + ] + }, + { + FIELD: "Substrate/host of isolation", + }, + { + FIELD: "Tested temperature growth range", + VALIDATION: [ + {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', + ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"} + ] + }, + { + FIELD: "Recommended growth temperature", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD30"}, + {TYPE: MISSING, ERROR_CODE: "STD31"}, + {TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?', + ERROR_CODE: "STD32", + MULTIPLE: True, SEPARATOR: ";"} + ] + }, + { + FIELD: "Recommended medium for growth", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD33"}, + {TYPE: MISSING, ERROR_CODE: "STD34"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Growth media", + MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"} + ] + }, + { + FIELD: "Form of supply", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "STD36"}, + {TYPE: MISSING, ERROR_CODE: "STD37"}, + {TYPE: CHOICES, VALUES: ['Agar', 'Cryo', 'Dry Ice', 'Liquid Culture Medium', + 'Lyo', 'Oil', 'Water'], + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"} + ] + }, + { + FIELD: "Other denomination", + }, + { + FIELD: "Coordinates of geographic origin", + VALIDATION: [ + {TYPE: COORDINATES, ERROR_CODE: "STD39"}, + ] + }, + { + FIELD: "Altitude of geographic origin", + VALIDATION: [ + {TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"}, + ] + }, + { + # value can be in the cell or in another sheet. 
Don't configure this + FIELD: "Geographic origin", + }, + { + FIELD: "Isolation habitat", + }, + { + FIELD: "Ontobiotope term for the isolation habitat", + VALIDATION: [ + {TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope", + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"} + ] + }, + { + FIELD: "GMO", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["1", "2"], + ERROR_CODE: "STD42"} + ] + }, + { + FIELD: "GMO construction information", + }, + { + FIELD: "Mutant information", + }, + { + FIELD: "Genotype", + }, + { + FIELD: "Sexual state", + VALIDATION: [ + {TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET, + ERROR_CODE: "STD43"} + # {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha", + # "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha", + # "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"], + # ERROR_CODE: "STD43"} + ] + }, + { + FIELD: "Ploidy", + VALIDATION: [ + {TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"], + ERROR_CODE: "STD44"} + ] + }, + { + FIELD: "Plasmids", + }, + { + FIELD: "Plasmids collections fields", + }, + { + # value can be in the cell or in another sheet. 
Don't configure this + FIELD: "Literature", + VALIDATION: [ + {TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET, + MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"} + ] + }, + { + FIELD: "Plant pathogenicity code", + }, + { + FIELD: "Pathogenicity", + }, + { + FIELD: "Enzyme production", + }, + { + FIELD: "Production of metabolites", + }, + { + FIELD: "Applications", + }, + { + FIELD: "Remarks" + }, + { + FIELD: "Literature linked to the sequence/genome", + }, +] +SHEETS_SCHEMA = { + LOCATIONS: { + "acronym": "GOD", + "id_field": "ID", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"}, + COLUMNS: [ + { + FIELD: "ID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GOD01"}, + {TYPE: MISSING, ERROR_CODE: "GOD02"}, + ] + }, + { + FIELD: "Country", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GOD03"}, + {TYPE: MISSING, ERROR_CODE: "GOD04"} + ] + }, + { + FIELD: "Region", + VALIDATION: [] + }, + { + FIELD: "City", + VALIDATION: [] + }, + { + FIELD: "Locality", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GOD06"}, + {TYPE: MISSING, ERROR_CODE: "GOD07"} + ] + } + ], + }, + GROWTH_MEDIA: { + "acronym": "GMD", + "id_field": "Acronym", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"}, + COLUMNS: [ + { + FIELD: "Acronym", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GMD01"}, + {TYPE: MISSING, ERROR_CODE: "GMD02"} + ] + }, + { + FIELD: "Description", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GMD03"}, + {TYPE: MISSING, ERROR_CODE: "GMD04"} + ] + }, + { + FIELD: "Full description", + VALIDATION: [] + }, + ], + }, + GENOMIC_INFO: { + "acronym": "GID", + "id_field": "Strain AN", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"}, + COLUMNS: [ + { + FIELD: "Strain AN", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GID01"}, + {TYPE: MISSING, ERROR_CODE: "GID02"}, + {TYPE: CROSSREF, CROSSREF_NAME: "Strains", + ERROR_CODE: "GID03"}, + ] + }, + { + FIELD: "Marker", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GID04"}, + {TYPE: MISSING, ERROR_CODE: 
"GID05"}, + {TYPE: CHOICES, ERROR_CODE: "GID06", + VALUES: ['16S rRNA', 'ACT', 'CaM', 'EF-1α', 'ITS', + 'LSU', 'RPB1', 'RPB2', 'TUBB']} + ] + }, + { + FIELD: "INSDC AN", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "GID07"}, + {TYPE: MISSING, ERROR_CODE: "GID08"}, + ] + }, + { + FIELD: "Sequence", + VALIDATION: [] + }, + ], + }, + STRAINS: { + "acronym": "STD", + 'id_field': 'Accession number', + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"}, + ROW_VALIDATION: [ + {TYPE: NAGOYA, ERROR_CODE: "STRXXX"}, + ], + COLUMNS: STRAIN_FIELDS, + }, + LITERATURE_SHEET: { + "acronym": "LID", + 'id_field': 'ID', + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"}, + ROW_VALIDATION: [ + {TYPE: BIBLIO, ERROR_CODE: 'LID17'} + ], + COLUMNS: [ + { + FIELD: "ID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID01"}, + {TYPE: MISSING, ERROR_CODE: "LID02"}, + ] + }, + { + FIELD: "Full reference", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID03"}, + ] + }, + { + FIELD: "Authors", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID05"}, + ] + }, + { + FIELD: "Title", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID07"}, + ] + }, + { + FIELD: "Journal", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID09"}, + ] + }, + { + FIELD: "Year", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID11"}, + ] + }, + { + FIELD: "Volume", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID13"}, + ] + }, + { + FIELD: "Issue", + VALIDATION: [] + }, + { + FIELD: "First page", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "LID15"}, + {TYPE: MISSING, ERROR_CODE: "LID16"}, + ] + }, + { + FIELD: "Last page", + VALIDATION: [] + }, + { + FIELD: "Book title", + VALIDATION: [] + }, + { + FIELD: "Editors", + VALIDATION: [] + }, + { + FIELD: "Publisher", + VALIDATION: [] + } + ], + }, + # SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []}, + # RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []}, + # FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []}, + # PLOIDY_SHEET: {"acronym": 
"PLD", COLUMNS: []}, + ONTOBIOTOPE: { + "acronym": "OTD", + "id_field": "ID", + VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"}, + COLUMNS: [ + { + FIELD: "ID", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "OTD01"}, + {TYPE: MISSING, ERROR_CODE: "OTD02"}, + ] + }, + { + FIELD: "Name", + VALIDATION: [ + {TYPE: MANDATORY, ERROR_CODE: "OTD03"}, + {TYPE: MISSING, ERROR_CODE: "OTD04"}, + ] + }, + ] + }, + # MARKERS: { + # "acronym": "MKD", + # "id_field": "", + # COLUMNS: [ + # { + # FIELD: "Acronym", + # VALIDATION: [] + # }, + # { + # FIELD: "Marker", + # VALIDATION: [] + # }, + # ], + # }, +} + +CROSS_REF_CONF = { + ONTOBIOTOPE: ['ID', 'Name'], + LITERATURE_SHEET: ['ID'], + LOCATIONS: ['Locality'], + GROWTH_MEDIA: ['Acronym'], + STRAINS: ["Accession number"], + SEXUAL_STATE_SHEET: [] + +} + +MIRRI_20200601_VALLIDATION_CONF = { + 'sheet_schema': SHEETS_SCHEMA, + 'cross_ref_conf': CROSS_REF_CONF, + 'keep_sheets_in_memory': [ + {'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}] +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..4d91e63 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +openpyxl +requests +requests_oauthlib +pycountry +deepdiff diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..316631c --- /dev/null +++ b/setup.py @@ -0,0 +1,35 @@ +import setuptools +from pathlib import Path +from setuptools import find_packages + +with open("README.md", "r") as fh: + long_description = fh.read() + +requirements = [line.strip() for line in open('requirements.txt')] +scripts = [str(f) for f in Path('./bin').glob('*.py')] + +setuptools.setup( + name="Mirri utils", # Replace with your own username + version=0.1, + author="P.Ziarsolo", + author_email="pziarsolo@gmail.com", + description="A small library to help dealing with MIRRI data", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/pziarsolo/mirri_utils", + packages=find_packages(), + 
package_data={"mirri": ['data/ontobiotopes.csv']}, + # package_dir={"mirri.entities": "mirri.entities" + # "mirri.io.parsers": "mirri.io.parsers", + # "mirri.io.writers": "mirri.io.writers", + # 'mirri.validation': 'mirri.vallidation'}, + install_requires=requirements, + scripts=scripts, + license="GNU General Public License v3.0", + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + python_requires='>=3.6', +) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/biolomics/__init__.py b/tests/biolomics/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/tests/biolomics/test_auth_operations.py b/tests/biolomics/test_auth_operations.py new file mode 100644 index 0000000..ec21098 --- /dev/null +++ b/tests/biolomics/test_auth_operations.py @@ -0,0 +1,22 @@ +import unittest + +from mirri.biolomics.remote.rest_client import BiolomicsClient +try: + from mirri.biolomics.secrets import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +except ImportError: + raise ImportError( + 'You need a secrets.py in the project dir. 
with CLIENT_ID, SECRET_ID, USERNAME, PASSWORD') + +from .utils import VERSION, SERVER_URL + + +class BiolomicsClientAuthTest(unittest.TestCase): + + def test_authentication(self): + client = BiolomicsClient(SERVER_URL, VERSION, CLIENT_ID, SECRET_ID, + USERNAME, PASSWORD) + access1 = client.get_access_token() + access2 = client.get_access_token() + assert access1 is not None + self.assertEqual(access1, access2) + diff --git a/tests/biolomics/test_growth_medium_operations.py b/tests/biolomics/test_growth_medium_operations.py new file mode 100644 index 0000000..12de1a6 --- /dev/null +++ b/tests/biolomics/test_growth_medium_operations.py @@ -0,0 +1,62 @@ +import unittest + +from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS +from mirri.biolomics.serializers.growth_media import GrowthMedium +from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient +from tests.biolomics.utils import SERVER_URL, VERSION + + +class BiolomicsSequenceClientTest(unittest.TestCase): + def setUp(self): + self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, + SECRET_ID, USERNAME, PASSWORD) + + def test_retrieve_media_by_id(self): + record_id = 101 + growth_medium = self.client.retrieve_by_id('growth_medium', record_id) + self.assertEqual(growth_medium.record_id, record_id) + + self.assertEqual(growth_medium.record_name, 'MA2PH6') + + def test_retrieve_media_by_id(self): + record_name = 'MA2PH6' + record_id = 101 + growth_medium = self.client.retrieve_by_name('growth_medium', record_name) + self.assertEqual(growth_medium.record_id, record_id) + self.assertEqual(growth_medium.record_name, record_name) + + def test_create_growth_media(self): + self.client.start_transaction() + try: + growth_medium = GrowthMedium() + growth_medium.acronym = 'BBB' + growth_medium.ingredients = 'alkhdflakhf' + growth_medium.description = 'desc' + + new_growth_medium = 
self.client.create(GROWTH_MEDIUM_WS, growth_medium) + print(new_growth_medium.dict()) + finally: + self.client.rollback() + + def test_update_growth_media(self): + self.client.start_transaction() + try: + growth_medium = GrowthMedium() + growth_medium.acronym = 'BBB' + growth_medium.ingredients = 'alkhdflakhf' + growth_medium.description = 'desc' + growth_medium.full_description = 'full' + new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium) + + new_growth_medium.full_description = 'full2' + updated_gm = new_growth_medium = self.client.update(GROWTH_MEDIUM_WS, new_growth_medium) + self.assertEqual(updated_gm.full_description, new_growth_medium.full_description) + + retrieved = self.client.retrieve_by_id(GROWTH_MEDIUM_WS, new_growth_medium.record_id) + self.assertEqual(retrieved.full_description, updated_gm.full_description) + + finally: + self.client.rollback() + + diff --git a/tests/biolomics/test_literature_operations.py b/tests/biolomics/test_literature_operations.py new file mode 100644 index 0000000..196d17d --- /dev/null +++ b/tests/biolomics/test_literature_operations.py @@ -0,0 +1,46 @@ +import unittest + +from .utils import VERSION, SERVER_URL +from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient, BIBLIOGRAPHY_WS +from mirri.entities.publication import Publication + + +class BiolomicsLiteratureClientTest(unittest.TestCase): + def setUp(self): + self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, + SECRET_ID, USERNAME, PASSWORD) + + def test_retrieve_biblio_by_id(self): + record_id = 100 + record_name = "Miscellaneous notes on Mucoraceae" + biblio = self.client.retrieve_by_id(BIBLIOGRAPHY_WS, record_id) + self.assertEqual(biblio.record_id, record_id) + + self.assertEqual(biblio.record_name, record_name) + + def test_retrieve_media_by_id(self): + record_id = 100 + record_name = "Miscellaneous notes on Mucoraceae" + biblio = 
self.client.retrieve_by_name(BIBLIOGRAPHY_WS, record_name) + self.assertEqual(biblio.record_id, record_id) + self.assertEqual(biblio.record_name, record_name) + self.assertEqual(biblio.year, 1994) + self.assertEqual(biblio.volume, '50') + + def test_create_biblio(self): + pub = Publication() + pub.pubmed_id = 'PM18192' + pub.journal = 'my_journal' + pub.title = 'awesome title' + pub.authors = 'pasdas, aposjdasd, alsalsfda' + pub.volume = 'volume 0' + record_id = None + try: + new_pub = self.client.create(BIBLIOGRAPHY_WS, pub) + record_id = new_pub.record_id + self.assertEqual(new_pub.title, pub.title) + self.assertEqual(new_pub.volume, pub.volume) + finally: + if record_id is not None: + self.client.delete_by_id(BIBLIOGRAPHY_WS, record_id) diff --git a/tests/biolomics/test_sequence_operations.py b/tests/biolomics/test_sequence_operations.py new file mode 100644 index 0000000..ddf8864 --- /dev/null +++ b/tests/biolomics/test_sequence_operations.py @@ -0,0 +1,49 @@ +import unittest + +from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient +from mirri.biolomics.serializers.sequence import GenomicSequenceBiolomics +from .utils import VERSION, SERVER_URL + + +class BiolomicsSequenceClientTest(unittest.TestCase): + def setUp(self) -> None: + self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, + SECRET_ID, USERNAME, PASSWORD) + + def test_retrieve_seq_by_id(self): + record_id = 101 + sequence = self.client.retrieve_by_id('sequence', record_id) + + self.assertEqual(sequence.record_id, record_id) + self.assertEqual(sequence.record_name, 'MUM 02.54 - CaM') + self.assertEqual(sequence.marker_type, 'CaM') + + def test_retrieve_seq_by_name(self): + record_name = 'MUM 02.54 - CaM' + sequence = self.client.retrieve_by_name('sequence', record_name) + + self.assertEqual(sequence.record_id, 101) + self.assertEqual(sequence.record_name, record_name) + 
self.assertEqual(sequence.marker_type, 'CaM') + + def test_create_delete_sequence(self): + marker = GenomicSequenceBiolomics() + marker.marker_id = 'GGAAUUA' + marker.marker_seq = 'aattgacgat' + marker.marker_type = 'CaM' + marker.record_name = 'peioMarker' + + new_marker = self.client.create('sequence', marker) + self.assertEqual(new_marker.marker_id, 'GGAAUUA') + self.assertEqual(new_marker.marker_seq, 'aattgacgat') + self.assertEqual(new_marker.marker_type, 'CaM') + self.assertEqual(new_marker.record_name, 'peioMarker') + self.assertTrue(new_marker.record_id) + + self.client.delete_by_id('sequence', new_marker.record_id) + + +if __name__ == "__main__": + # import sys;sys.argv = ['', 'BiolomicsClient.Test.test_get_strain_by_id'] + unittest.main() diff --git a/tests/biolomics/test_serializers.py b/tests/biolomics/test_serializers.py new file mode 100644 index 0000000..e57d1f3 --- /dev/null +++ b/tests/biolomics/test_serializers.py @@ -0,0 +1,727 @@ +import unittest +import pycountry +import deepdiff +from pprint import pprint +from mirri.biolomics.serializers.sequence import ( + GenomicSequenceBiolomics, + serialize_to_biolomics as sequence_to_biolomics, + serialize_from_biolomics as sequence_from_biolomics) + +from mirri.biolomics.serializers.strain import ( + serialize_to_biolomics as strain_to_biolomics, + serialize_from_biolomics as strain_from_biolomics) +from mirri.biolomics.serializers.growth_media import ( + # serialize_to_biolomics as growth_medium_to_biolomics, + serialize_from_biolomics as growth_medium_from_biolomics) +from mirri.biolomics.serializers.bibliography import ( + serializer_from_biolomics as literature_from_biolomics, + serializer_to_biolomics as literature_to_biolomics +) +from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient +from mirri.entities.publication import Publication +from .utils import create_full_data_strain, VERSION, SERVER_URL + 
+ +STRAIN_WS = { + 'CreationDate': '2021-05-19T12:22:33', + 'CreatorUserName': 'pziarsolo@cect.org', + 'LastChangeDate': '2021-05-19T12:22:36', + 'LastChangeUserName': 'pziarsolo@cect.org', + 'RecordDetails': {'ABS related files': {'FieldType': 21, + 'Value': [{'Name': 'link', + 'Value': 'https://example.com'}]}, + 'Altitude of geographic origin': {'FieldType': 4, + 'Value': 121.0}, + 'Applications': {'FieldType': 5, 'Value': 'health'}, + 'Catalog URL': {'FieldType': 21, 'Value': []}, + 'Collection accession number': {'FieldType': 5, + 'Value': 'TESTCC 1'}, + 'Collection date': {'FieldType': 8, 'Value': '1991/01/01'}, + 'Collector': {'FieldType': 5, 'Value': 'the collector'}, + 'Comment on taxonomy': {'FieldType': 5, + 'Value': 'lalalalla'}, + 'Coordinates of geographic origin': {'FieldType': 12, + 'Value': {'Altitude': 0.0, + 'Latitude': 23.3, + 'Longitude': 23.3, + 'Precision': 0.0}}, + 'Country': {'FieldType': 118, + 'Value': [{'Name': {'FieldType': 5, + 'Value': 'Spain'}, + 'RecordId': 54, + 'TargetFieldValue': None}]}, + 'Data provided by': {'FieldType': 22, 'Value': 'Unknown'}, + 'Date of inclusion in the catalogue': {'FieldType': 8, + 'Value': '1985/05/02'}, + 'Deposit date': {'FieldType': 8, 'Value': '1985/05/02'}, + 'Depositor': {'FieldType': 5, + 'Value': 'NCTC, National Collection of Type ' + 'Cultures - NCTC, London, United ' + 'Kingdom of Great Britain and ' + 'Northern Ireland.'}, + 'Dual use': {'FieldType': 20, 'Value': 'yes'}, + 'Enzyme production': {'FieldType': 5, + 'Value': 'some enzimes'}, + 'Form': {'FieldType': 3, + 'Value': [{'Name': 'Agar', 'Value': 'yes'}, + {'Name': 'Cryo', 'Value': 'no'}, + {'Name': 'Dry Ice', 'Value': 'no'}, + {'Name': 'Liquid Culture Medium', + 'Value': 'no'}, + {'Name': 'Lyo', 'Value': 'yes'}, + {'Name': 'Oil', 'Value': 'no'}, + {'Name': 'Water', 'Value': 'no'}]}, + 'GMO': {'FieldType': 22, 'Value': 'Yes'}, + 'GMO construction information': {'FieldType': 5, + 'Value': 'instructrion to ' + 'build'}, + 'Genotype': 
{'FieldType': 5, 'Value': 'some genotupe'}, + 'Geographic origin': {'FieldType': 5, + 'Value': 'una state; one ' + 'municipality; somewhere in ' + 'the world'}, + 'History': {'FieldType': 5, + 'Value': 'newer < In the middle < older'}, + 'Infrasubspecific names': {'FieldType': 5, + 'Value': 'serovar tete'}, + 'Interspecific hybrid': {'FieldType': 20, 'Value': 'no'}, + 'Isolation date': {'FieldType': 8, 'Value': '1900/01/01'}, + 'Isolation habitat': {'FieldType': 5, + 'Value': 'some habitat'}, + 'Isolator': {'FieldType': 5, 'Value': 'the isolator'}, + 'Literature': {'FieldType': 118, 'Value': []}, + 'MTA files URL': {'FieldType': 21, + 'Value': [{'Name': 'link', + 'Value': 'https://example.com'}]}, + 'MTA text': {'FieldType': 5, 'Value': ''}, + 'Metabolites production': {'FieldType': 5, + 'Value': 'big factory of cheese'}, + 'Mutant information': {'FieldType': 5, 'Value': 'x-men'}, + 'Nagoya protocol restrictions and compliance conditions': {'FieldType': 20, + 'Value': 'no ' + 'known ' + 'restrictions ' + 'under ' + 'the ' + 'Nagoya ' + 'protocol'}, + 'Ontobiotope': {'FieldType': 118, + 'Value': [{'Name': {'FieldType': 5, + 'Value': 'anaerobic ' + 'bioreactor ' + '(OBT:000190)'}, + 'RecordId': 100, + 'TargetFieldValue': None}]}, + 'Ontobiotope term for the isolation habitat': {'FieldType': 5, + 'Value': ''}, + 'Orders': {'FieldType': 118, 'Value': []}, + 'Organism type': {'FieldType': 3, + 'Value': [{'Name': 'Algae', 'Value': 'no'}, + {'Name': 'Archaea', + 'Value': 'yes'}, + {'Name': 'Bacteria', + 'Value': 'no'}, + {'Name': 'Cyanobacteria', + 'Value': 'no'}, + {'Name': 'Filamentous Fungi', + 'Value': 'no'}, + {'Name': 'Phage', 'Value': 'no'}, + {'Name': 'Plasmid', + 'Value': 'no'}, + {'Name': 'Virus', 'Value': 'no'}, + {'Name': 'Yeast', 'Value': 'no'}, + {'Name': 'Microalgae', + 'Value': '?'}]}, + 'Other culture collection numbers': {'FieldType': 5, + 'Value': 'aaa a; aaa3 ' + 'a3'}, + 'Other denomination': {'FieldType': 5, 'Value': ''}, + 'Pathogenicity': 
{'FieldType': 5, 'Value': 'illness'}, + 'Plasmids': {'FieldType': 5, 'Value': 'asda'}, + 'Plasmids collections fields': {'FieldType': 5, + 'Value': 'asdasda'}, + 'Ploidy': {'FieldType': 20, 'Value': 'Polyploid'}, + 'Quarantine in Europe': {'FieldType': 20, 'Value': 'no'}, + 'Recommended growth medium': {'FieldType': 118, + 'Value': [{'Name': {'FieldType': 5, + 'Value': 'AAA'}, + 'RecordId': 1, + 'TargetFieldValue': None}]}, + 'Recommended growth temperature': {'FieldType': 19, + 'MaxValue': 30.0, + 'MinValue': 30.0}, + 'Remarks': {'FieldType': 5, 'Value': 'no remarks for me'}, + 'Restrictions on use': {'FieldType': 20, + 'Value': 'no restriction apply'}, + 'Risk group': {'FieldType': 20, 'Value': '1'}, + 'Sequences 16s': {"Value": [ + { + "Name": { + "Value": "X76436", + "FieldType": 5 + }, + "RecordId": 50992, + "TargetFieldValue": { + "Value": { + "Sequence": "" + }, + "FieldType": 14 + } + } + ], + "FieldType": 114}, + 'Sequences 18S rRNA': {'FieldType': 114, 'Value': []}, + 'Sequences 23S rRNA': {'FieldType': 114, 'Value': []}, + 'Sequences ACT': {'FieldType': 114, 'Value': []}, + 'Sequences AmdS': {'FieldType': 114, 'Value': []}, + 'Sequences Amds12': {'FieldType': 114, 'Value': []}, + 'Sequences Beta tubulin': {'FieldType': 114, 'Value': []}, + 'Sequences COX1': {'FieldType': 114, 'Value': []}, + 'Sequences COX2': {'FieldType': 114, 'Value': []}, + 'Sequences CaM': {'FieldType': 114, 'Value': []}, + 'Sequences Cct8': {'FieldType': 114, 'Value': []}, + 'Sequences Cit1': {'FieldType': 114, 'Value': []}, + 'Sequences CypA': {'FieldType': 114, 'Value': []}, + 'Sequences GDP': {'FieldType': 114, 'Value': []}, + 'Sequences GPD': {'FieldType': 114, 'Value': []}, + 'Sequences Genome': {'FieldType': 114, 'Value': []}, + 'Sequences HIS': {'FieldType': 114, 'Value': []}, + 'Sequences HSP': {'FieldType': 114, 'Value': []}, + 'Sequences IDH': {'FieldType': 114, 'Value': []}, + 'Sequences IGS': {'FieldType': 114, 'Value': []}, + 'Sequences ITS': {'FieldType': 114, 'Value': 
[]}, + 'Sequences LSU': {'FieldType': 114, 'Value': []}, + 'Sequences MAT': {'FieldType': 114, 'Value': []}, + 'Sequences MAT1': {'FieldType': 114, 'Value': []}, + 'Sequences Miscellaneous': {'FieldType': 114, 'Value': []}, + 'Sequences NorA': {'FieldType': 114, 'Value': []}, + 'Sequences NorB': {'FieldType': 114, 'Value': []}, + 'Sequences Omt12': {'FieldType': 114, 'Value': []}, + 'Sequences OmtA': {'FieldType': 114, 'Value': []}, + 'Sequences PcCYP': {'FieldType': 114, 'Value': []}, + 'Sequences PpgA': {'FieldType': 114, 'Value': []}, + 'Sequences PreA': {'FieldType': 114, 'Value': []}, + 'Sequences PreB': {'FieldType': 114, 'Value': []}, + 'Sequences RAPD': {'FieldType': 114, 'Value': []}, + 'Sequences RPB1': {'FieldType': 114, 'Value': []}, + 'Sequences RPB2': {'FieldType': 114, 'Value': []}, + 'Sequences SSU': {'FieldType': 114, 'Value': []}, + 'Sequences TEF1a': {'FieldType': 114, 'Value': []}, + 'Sequences TEF2': {'FieldType': 114, 'Value': []}, + 'Sequences TUB': {'FieldType': 114, 'Value': []}, + 'Sequences Tsr1': {'FieldType': 114, 'Value': []}, + 'Sequences c16S rRNA': {'FieldType': 114, 'Value': []}, + 'Sequences cbhI': {'FieldType': 114, 'Value': []}, + 'Sequences mcm7': {'FieldType': 114, 'Value': []}, + 'Sequences rbcL': {'FieldType': 114, 'Value': []}, + 'Sexual state': {'FieldType': 5, 'Value': 'MT+A'}, + 'Status': {'FieldType': 5, + 'Value': 'type of Bacillus alcalophilus'}, + 'Strain from a registered collection': {'FieldType': 20, + 'Value': 'no'}, + 'Substrate of isolation': {'FieldType': 5, + 'Value': 'some substrate'}, + 'Taxon name': {'FieldType': 109, + 'Value': [{'Name': {'FieldType': 5, + 'Value': 'Escherichia ' + 'coli'}, + 'RecordId': 100004123, + 'TargetFieldValue': {'DesktopInfo': None, + 'DesktopInfoHtml': 'Current ' + 'name: ' + 'Escherichia ' + 'coli ' + '(Migula ' + '1895) ' + 'Castellani ' + 'and ' + 'Chalmers ' + '1919', + 'FieldType': 27, + 'NewSynFieldInfo': None, + 'ObligateSynonymId': 0, + 'OriginalSynFieldInfo': None, + 
'SynInfo': {'BasionymRecord': {'NameInfo': '', + 'RecordId': 100004123, + 'RecordName': 'Escherichia ' + 'coli ' + '(Migula ' + '1895) ' + 'Castellani ' + 'and ' + 'Chalmers ' + '1919', + 'SecondLevelRecords': None}, + 'CurrentNameRecord': {'NameInfo': '', + 'RecordId': 100004123, + 'RecordName': 'Escherichia ' + 'coli ' + '(Migula ' + '1895) ' + 'Castellani ' + 'and ' + 'Chalmers ' + '1919', + 'SecondLevelRecords': None}, + 'ObligateSynonymRecords': [], + 'SelectedRecord': { + 'NameInfo': 'Escherichia ' + 'coli ' + '(Migula ' + '1895) ' + 'Castellani ' + 'and ' + 'Chalmers ' + '1919', + 'RecordId': 100004123, + 'RecordName': 'Escherichia ' + 'coli ' + '(Migula ' + '1895) ' + 'Castellani ' + 'and ' + 'Chalmers ' + '1919', + 'SecondLevelRecords': None}, + 'TaxonSynonymsRecords': []}, + 'SynonymId': 100004123}}]}, + 'Tested temperature growth range': {'FieldType': 19, + 'MaxValue': 32.0, + 'MinValue': 29.0}, + 'Type description': {'FieldType': 5, 'Value': ''}}, + 'RecordId': 148038, + 'RecordName': 'MIRRI 2240561'} + +STRAIN_WS_EXPECTED_NO_REMOTE = { + 'Acronym': 'MIRRI', + 'RecordDetails': {'ABS related files': {'FieldType': 'U', + 'Value': [{'Name': 'link', + 'Value': 'https://example.com'}]}, + 'Altitude of geographic origin': {'FieldType': 'D', + 'Value': 121}, + 'Applications': {'FieldType': 'E', 'Value': 'health'}, + 'Collection accession number': {'FieldType': 'E', + 'Value': 'TESTCC 1'}, + 'Collection date': {'FieldType': 'H', 'Value': '1991-01-01'}, + 'Collector': {'FieldType': 'E', 'Value': 'the collector'}, + 'Comment on taxonomy': {'FieldType': 'E', + 'Value': 'lalalalla'}, + 'Coordinates of geographic origin': {'FieldType': 'L', + 'Value': {'Latitude': 23.3, + 'Longitude': 23.3}}, + 'Date of inclusion in the catalogue': {'FieldType': 'H', + 'Value': '1985-05-02'}, + 'Deposit date': {'FieldType': 'H', 'Value': '1985-05-02'}, + 'Depositor': {'FieldType': 'E', + 'Value': 'NCTC, National Collection of Type ' + 'Cultures - NCTC, London, United ' + 'Kingdom of 
Great Britain and ' + 'Northern Ireland.'}, + 'Dual use': {'FieldType': 'T', 'Value': 'yes'}, + 'Enzyme production': {'FieldType': 'E', + 'Value': 'some enzimes'}, + 'Form': {'FieldType': 'C', + 'Value': [{'Name': 'Agar', 'Value': 'yes'}, + {'Name': 'Cryo', 'Value': 'no'}, + {'Name': 'Dry Ice', 'Value': 'no'}, + {'Name': 'Liquid Culture Medium', + 'Value': 'no'}, + {'Name': 'Lyo', 'Value': 'yes'}, + {'Name': 'Oil', 'Value': 'no'}, + {'Name': 'Water', 'Value': 'no'}]}, + 'GMO': {'FieldType': 'V', 'Value': 'Yes'}, + 'GMO construction information': {'FieldType': 'E', + 'Value': 'instructrion to ' + 'build'}, + 'Genotype': {'FieldType': 'E', 'Value': 'some genotupe'}, + 'Geographic origin': {'FieldType': 'E', + 'Value': 'una state; one ' + 'municipality; somewhere in ' + 'the world'}, + 'History': {'FieldType': 'E', + 'Value': 'firstplave < seconn place < third ' + 'place'}, + 'Infrasubspecific names': {'FieldType': 'E', + 'Value': 'serovar tete'}, + 'Interspecific hybrid': {'FieldType': 'T', 'Value': 'no'}, + 'Isolation date': {'FieldType': 'H', 'Value': '1900-01-01'}, + 'Isolation habitat': {'FieldType': 'E', + 'Value': 'some habitat'}, + 'Isolator': {'FieldType': 'E', 'Value': 'the isolator'}, + 'MTA files URL': {'FieldType': 'U', + 'Value': [{'Name': 'link', + 'Value': 'https://example.com'}]}, + 'Metabolites production': {'FieldType': 'E', + 'Value': 'big factory of cheese'}, + 'Mutant information': {'FieldType': 'E', 'Value': 'x-men'}, + 'Nagoya protocol restrictions and compliance conditions': {'FieldType': 'T', + 'Value': 'no ' + 'known ' + 'restrictions ' + 'under ' + 'the ' + 'Nagoya ' + 'protocol'}, + 'Ontobiotope': {'FieldType': 'RLink', 'Value': 'OBT:000190'}, + 'Organism type': {'FieldType': 'C', + 'Value': [{'Name': 'Algae', 'Value': 'no'}, + {'Name': 'Archaea', + 'Value': 'yes'}, + {'Name': 'Bacteria', + 'Value': 'no'}, + {'Name': 'Cyanobacteria', + 'Value': 'no'}, + {'Name': 'Filamentous Fungi', + 'Value': 'no'}, + {'Name': 'Phage', 'Value': 'no'}, + 
{'Name': 'Plasmid', + 'Value': 'no'}, + {'Name': 'Virus', 'Value': 'no'}, + {'Name': 'Yeast', + 'Value': 'no'}]}, + 'Other culture collection numbers': {'FieldType': 'E', + 'Value': 'aaa a; aaa3 ' + 'a3'}, + 'Pathogenicity': {'FieldType': 'E', 'Value': 'illness'}, + 'Plasmids': {'FieldType': 'E', 'Value': 'asda'}, + 'Plasmids collections fields': {'FieldType': 'E', + 'Value': 'asdasda'}, + 'Ploidy': {'FieldType': 'T', 'Value': 'Polyploid'}, + 'Quarantine in Europe': {'FieldType': 'T', 'Value': 'no'}, + 'Recommended growth temperature': {'FieldType': 'S', + 'MaxValue': 30.0, + 'MinValue': 30.0}, + 'Remarks': {'FieldType': 'E', 'Value': 'no remarks for me'}, + 'Restrictions on use': {'FieldType': 'T', + 'Value': 'no restriction apply'}, + 'Risk group': {'FieldType': 'T', 'Value': '1'}, + 'Sexual state': {'FieldType': 'E', 'Value': 'MT+A'}, + 'Status': {'FieldType': 'E', + 'Value': 'type of Bacillus alcalophilus'}, + 'Strain from a registered collection': {'FieldType': 'T', + 'Value': 'no'}, + 'Substrate of isolation': {'FieldType': 'E', + 'Value': 'some substrate'}, + 'Taxon name': {'FieldType': 'SynLink', + 'Value': 'Escherichia coli'}, + 'Tested temperature growth range': {'FieldType': 'S', + 'MaxValue': 32.0, + 'MinValue': 29.0}}} + + +class StrainSerializerTest(unittest.TestCase): + + def test_serialize_to_biolomics(self): + strain = create_full_data_strain() + ws_strain = strain_to_biolomics(strain, client=None) + self.assertDictEqual(ws_strain, STRAIN_WS_EXPECTED_NO_REMOTE) + + def test_serialize_to_biolomics_remote(self): + client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, + SECRET_ID, USERNAME, PASSWORD) + strain = create_full_data_strain() + marker = GenomicSequenceBiolomics() + marker.marker_id = "MUM 02.15 - Beta tubulin" + marker.marker_type = 'TUBB' + strain.genetics.markers = [marker] + ws_strain = strain_to_biolomics(strain, client=client) + + self.assertEqual(strain.collect.habitat_ontobiotope, + 
ws_strain['RecordDetails']['Ontobiotope']['Value'][0]['Name']['Value']) + self.assertEqual(pycountry.countries.get(alpha_3=strain.collect.location.country).name, + ws_strain['RecordDetails']['Country']['Value'][0]['Name']['Value']) + self.assertEqual(strain.publications[0].title, + ws_strain['RecordDetails']['Literature']['Value'][0]['Name']['Value']) + self.assertEqual(strain.genetics.markers[0].marker_id, + ws_strain['RecordDetails']['Sequences TUB']['Value'][0]['Name']['Value']) + + def test_serialize_from_biolomics(self): + ws_strain = STRAIN_WS + strain = strain_from_biolomics(ws_strain) + self.assertEqual(strain.record_id, 148038) + self.assertEqual(strain.record_name, 'MIRRI 2240561') + self.assertEqual(strain.taxonomy.long_name, 'Escherichia coli') + self.assertEqual(strain.growth.recommended_media, ['AAA']) + self.assertEqual(strain.collect.location.altitude, 121) + self.assertEqual(strain.collect.location.country, 'ESP') + self.assertEqual(strain.applications, 'health') + self.assertEqual(strain.id.strain_id, 'TESTCC 1') + self.assertEqual(strain.collect.date.strfdate, '19910101') + self.assertEqual(strain.taxonomy.comments, 'lalalalla') + self.assertEqual(strain.catalog_inclusion_date.strfdate, '19850502') + self.assertIn('NCTC, National Collection of Type ', strain.deposit.who) + self.assertTrue(strain.is_potentially_harmful) + self.assertEqual(strain.form_of_supply, ['Agar', 'Lyo']) + self.assertTrue(strain.genetics.gmo) + self.assertEqual(strain.genetics.gmo_construction, 'instructrion to build') + self.assertEqual(strain.genetics.genotype, 'some genotupe') + self.assertEqual(strain.history, ['newer', 'In the middle', 'older']) + self.assertEqual(strain.taxonomy.infrasubspecific_name, 'serovar tete') + self.assertEqual(strain.isolation.who, 'the isolator') + self.assertEqual(strain.isolation.date.strfdate, '19000101') + self.assertEqual(strain.mta_files, ['https://example.com']) + self.assertEqual(strain.genetics.mutant_info, 'x-men') + 
self.assertEqual(strain.collect.habitat_ontobiotope, 'OBT:000190') + self.assertEqual(strain.taxonomy.organism_type[0].name, 'Archaea') + self.assertEqual(strain.other_numbers[0].strain_id, 'aaa a') + self.assertEqual(strain.other_numbers[1].strain_id, 'aaa3 a3') + self.assertEqual(strain.pathogenicity, 'illness') + self.assertEqual(strain.genetics.plasmids, ['asda']) + self.assertEqual(strain.genetics.ploidy, 9) + self.assertFalse(strain.is_subject_to_quarantine) + self.assertEqual(strain.risk_group, '1') + self.assertFalse(strain.is_from_registered_collection) + self.assertEqual(strain.growth.tested_temp_range, {'min': 29, 'max': 32}) + + +BIOLOMICSSEQ = { + 'RecordDetails': { + 'Barcode level': {'FieldType': 20, 'Value': 'undefined'}, + 'DNA extract number': {'FieldType': 5, 'Value': ''}, + 'DNA sequence': {'FieldType': 14, + 'Value': {'Sequence': 'caaaggaggccttctccctcttcgtaag'}}, + 'Editing state': {'FieldType': 20, 'Value': 'Auto import'}, + 'Forward primer(s)': {'FieldType': 5, 'Value': ''}, + 'Genbank': {'FieldType': 21, 'Value': []}, + 'INSDC number': {'FieldType': 5, 'Value': 'AATGAT'}, + 'Literature': {'FieldType': 21, 'Value': []}, + 'Literature1': {'FieldType': 118, 'Value': []}, + 'Marker name': {'FieldType': 5, 'Value': 'CaM'}, + 'Privacy': {'FieldType': 20, 'Value': 'undefined'}, + 'Quality': {'FieldType': 5, 'Value': ''}, + 'Remarks': {'FieldType': 5, 'Value': ''}, + 'Reverse primer(s)': {'FieldType': 5, 'Value': ''}, + 'Review state': {'FieldType': 5, 'Value': ''}, + 'Strain number': {'FieldType': 5, 'Value': 'MUM 02.54'}}, + 'RecordId': 101, + 'RecordName': 'MUM 02.54 - CaM'} + + +class SequenceSerializerTest(unittest.TestCase): + + def test_from_biolomics(self): + marker = sequence_from_biolomics(BIOLOMICSSEQ) + self.assertEqual(marker.record_name, BIOLOMICSSEQ['RecordName']) + self.assertEqual(marker.record_id, BIOLOMICSSEQ['RecordId']) + self.assertEqual(marker.marker_type, BIOLOMICSSEQ['RecordDetails']['Marker name']['Value']) + 
self.assertEqual(marker.marker_id, BIOLOMICSSEQ['RecordDetails']['INSDC number']['Value']) + self.assertEqual(marker.marker_seq, BIOLOMICSSEQ['RecordDetails']['DNA sequence']['Value']['Sequence']) + + def test_to_biolomics(self): + marker = GenomicSequenceBiolomics() + marker.marker_id = 'GGAAUUA' + marker.marker_seq = 'aattgacgat' + marker.marker_type = 'CaM' + marker.record_name = 'peioMarker' + marker.record_id = 111 + ws_seq = sequence_to_biolomics(marker) + expected = {'RecordId': marker.record_id, + 'RecordName': marker.record_name, + 'RecordDetails': { + 'INSDC number': {'Value': marker.marker_id, 'FieldType': 'E'}, + 'DNA sequence': {'Value': {'Sequence': marker.marker_seq}, 'FieldType': 'N'}, + 'Marker name': {'Value': marker.marker_type, 'FieldType': 'E'}}} + + self.assertEqual(ws_seq, expected) + + +BIOLOMICS_MEDIUM = { + "RecordId": 100, + "RecordName": "MA20S", + "RecordDetails": { + "Full description": { + "Value": "mout agar+20% saccharose", + "FieldType": 5 + }, + "Ingredients": { + "Value": "Malt extract\r\n\tDilute brewery malt with water to 10% sugar solution (level 10 on Brix saccharose meter), 15 minutes at 121 C\r\nsaccharose\t200g\r\ndistilled water\t0.6l\r\nagar\t15g\r\n", + "FieldType": 5 + }, + "Link to full description": { + "Value": [], + "FieldType": 21 + }, + "Medium description": { + "Value": "", + "FieldType": 5 + }, + "Other name": { + "Value": "", + "FieldType": 5 + }, + "pH": { + "Value": "7 with KOH", + "FieldType": 5 + }, + "Remarks": { + "Value": "", + "FieldType": 5 + }, + "Reference": { + "Value": "", + "FieldType": 5 + }, + "Sterilization conditions": { + "Value": "15 minutes at 121 C", + "FieldType": 5 + } + } +} + + +class MediumSerializerTest(unittest.TestCase): + def test_from_biolomics(self): + medium = growth_medium_from_biolomics(BIOLOMICS_MEDIUM) + self.assertEqual(medium.record_id, BIOLOMICS_MEDIUM['RecordId']) + self.assertEqual(medium.record_name, BIOLOMICS_MEDIUM['RecordName']) + 
self.assertEqual(medium.ingredients, BIOLOMICS_MEDIUM['RecordDetails']['Ingredients']['Value']) + self.assertEqual(medium.full_description, BIOLOMICS_MEDIUM['RecordDetails']['Full description']['Value']) + self.assertEqual(medium.ph, BIOLOMICS_MEDIUM['RecordDetails']['pH']['Value']) + + +BIOLOMICS_BIBLIOGRAPHY = { + "RecordId": 100, + "RecordName": "Miscellaneous notes on Mucoraceae", + "RecordDetails": { + "Associated strains": { + "Value": [], + "FieldType": 118 + }, + "Associated taxa": { + "Value": [], + "FieldType": 118 + }, + "Authors": { + "Value": "Schipper, M.A.A.; Samson, R.A.", + "FieldType": 5 + }, + "Associated sequences": { + "Value": [], + "FieldType": 118 + }, + "Abstract": { + "Value": "", + "FieldType": 5 + }, + "Collection": { + "Value": "", + "FieldType": 5 + }, + "DOI number": { + "Value": "", + "FieldType": 5 + }, + "Editor(s)": { + "Value": "", + "FieldType": 5 + }, + "Full reference": { + "Value": "", + "FieldType": 5 + }, + "Hyperlink": { + "Value": [], + "FieldType": 21 + }, + "ISBN": { + "Value": "", + "FieldType": 5 + }, + "ISSN": { + "Value": "", + "FieldType": 5 + }, + "Issue": { + "Value": "", + "FieldType": 5 + }, + "Journal": { + "Value": "Mycotaxon", + "FieldType": 5 + }, + "Journal-Book": { + "Value": "", + "FieldType": 5 + }, + "Keywords": { + "Value": "", + "FieldType": 5 + }, + "Page from": { + "Value": "475", + "FieldType": 5 + }, + "Page to": { + "Value": "491", + "FieldType": 5 + }, + "Publisher": { + "Value": "", + "FieldType": 5 + }, + "PubMed ID": { + "Value": "", + "FieldType": 5 + }, + "Volume": { + "Value": "50", + "FieldType": 5 + }, + "Year": { + "Value": 1994, + "FieldType": 4 + } + } +} + + +class BibliographySerializerTest(unittest.TestCase): + def test_from_biolomics(self): + pub = literature_from_biolomics(BIOLOMICS_BIBLIOGRAPHY) + self.assertEqual(pub.record_name, "Miscellaneous notes on Mucoraceae") + self.assertEqual(pub.record_id, 100) + self.assertEqual(pub.year, 1994) + self.assertEqual(pub.authors, 
"Schipper, M.A.A.; Samson, R.A.") + + def test_to_biolomics(self): + pub = Publication() + pub.title = 'My title' + pub.year = 1992 + pub.authors = 'me and myself' + pub.pubmed_id = '1112222' + pub.issue = 'issue' + ws_data = literature_to_biolomics(pub) + expected = { + 'RecordDetails': { + 'Authors': {'FieldType': 'E', 'Value': 'me and myself'}, + 'PubMed ID': {'FieldType': 'E', 'Value': '1112222'}, + 'Issue': {'FieldType': 'E', 'Value': 'issue'}, + 'Year': {'FieldType': 'D', 'Value': 1992}}, + 'RecordName': 'My title'} + self.assertDictEqual(expected, ws_data) + + def test_to_biolomics2(self): + pub = Publication() + pub.pubmed_id = '1112222' + ws_data = literature_to_biolomics(pub) + expected = { + 'RecordDetails': { + 'PubMed ID': {'FieldType': 'E', 'Value': '1112222'}}, + 'RecordName': f'PUBMED:{pub.pubmed_id}'} + self.assertDictEqual(expected, ws_data) + + pub = Publication() + pub.doi = 'doi.er/111/12131' + ws_data = literature_to_biolomics(pub) + expected = { + 'RecordDetails': { + 'DOI number': {'FieldType': 'E', 'Value': pub.doi}}, + 'RecordName': f'DOI:{pub.doi}'} + self.assertDictEqual(expected, ws_data) + + +if __name__ == "__main__": + # Debug filter, disabled so that a direct run executes every test class: + # import sys; sys.argv = ['', 'BibliographySerializerTest'] + unittest.main() diff --git a/tests/biolomics/test_strain_operations.py b/tests/biolomics/test_strain_operations.py new file mode 100644 index 0000000..2bf9584 --- /dev/null +++ b/tests/biolomics/test_strain_operations.py @@ -0,0 +1,156 @@ +import unittest + +from mirri.biolomics.remote.endoint_names import STRAIN_WS +from .utils import VERSION, SERVER_URL, create_full_data_strain +from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD +from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient +from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number + + +class BiolomicsStrainClientTest(unittest.TestCase): + def setUp(self): + self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, + 
SECRET_ID, USERNAME, PASSWORD) + + def test_retrieve_strain_by_id(self): + record_id = 14803 + strain = self.client.retrieve_by_id(STRAIN_WS, record_id) + self.assertEqual(strain.record_id, record_id) + print(strain.record_name) + + def test_retrieve_strain_by_name(self): + record_id = 14803 + record_name = 'MIRRI0014803' + strain = self.client.retrieve_by_name(STRAIN_WS, record_name) + self.assertEqual(strain.record_name, record_name) + self.assertEqual(strain.record_id, record_id) + + def test_search_strain(self): + accession_number = "BEA 0014B" + query = {"Query": [{"Index": 0, + "FieldName": "Collection accession number", + "Operation": "TextExactMatch", + "Value": accession_number}], + "Expression": "Q0", + "DisplayStart": 0, + "DisplayLength": 10} + + search_response = self.client.search(STRAIN_WS, query) + + self.assertEqual(search_response['total'], 1) + self.assertEqual(search_response['records'][0].id.strain_id, + accession_number) + + def test_search_strain4(self): + accession_number = "TESTCC 1" + query = {"Query": [{"Index": 0, + "FieldName": "Collection accession number", + "Operation": "TextExactMatch", + "Value": accession_number}], + "Expression": "Q0", + "DisplayStart": 0, + "DisplayLength": 10} + + search_response = self.client.search(STRAIN_WS, query) + for strain in search_response['records']: + print(strain) + self.client.delete_by_id(STRAIN_WS, strain.record_id) + + def test_search_strain_no_found(self): + accession_number = "BEA 0014B_" + query = {"Query": [{"Index": 0, + "FieldName": "Collection accession number", + "Operation": "TextExactMatch", + "Value": accession_number}], + "Expression": "Q0", + "DisplayStart": 0, + "DisplayLength": 10} + + search_response = self.client.search(STRAIN_WS, query) + + self.assertEqual(search_response['total'], 0) + self.assertFalse(search_response['records']) + + def test_create_strain(self): + strain = create_full_data_strain() + strain.taxonomy.interspecific_hybrid = None + record_id = None + try: + 
new_strain = self.client.create(STRAIN_WS, strain) + record_id = new_strain.record_id + self.assertIsNone(new_strain.taxonomy.interspecific_hybrid) + self.assertEqual(new_strain.growth.recommended_media, ['AAA']) + self.assertEqual(new_strain.id.strain_id, strain.id.strain_id) + finally: + if record_id is not None: + self.client.delete_by_id(STRAIN_WS, record_id) + + def test_update_strain(self): + strain = create_full_data_strain() + record_id = None + try: + new_strain = self.client.create(STRAIN_WS, strain) + record_id = new_strain.record_id + self.assertEqual(new_strain.id.strain_id, strain.id.strain_id) + self.assertFalse(new_strain.taxonomy.interspecific_hybrid) + new_strain.id.number = '2' + new_strain.taxonomy.interspecific_hybrid = None + updated_strain = self.client.update(STRAIN_WS, new_strain) + self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id) + self.assertIsNone(updated_strain.taxonomy.interspecific_hybrid) + + retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id) + self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id) + self.assertIsNone(retrieved_strain.taxonomy.interspecific_hybrid) + finally: + if record_id is not None: + print('deleting') + self.client.delete_by_id(STRAIN_WS, record_id) + + def test_update_strain_pathogenicity(self): + strain = create_full_data_strain() + print(strain.pathogenicity) + record_id = None + try: + new_strain = self.client.create(STRAIN_WS, strain) + record_id = new_strain.record_id + self.assertEqual(new_strain.id.strain_id, strain.id.strain_id) + self.assertEqual(new_strain.pathogenicity, 'illness') + + new_strain.pathogenicity = None + updated_strain = self.client.update(STRAIN_WS, new_strain) + self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id) + self.assertIsNone(updated_strain.pathogenicity) + + retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id) + self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id) + 
self.assertIsNone(retrieved_strain.pathogenicity) + finally: + if record_id is not None: + self.client.delete_by_id(STRAIN_WS, record_id) + + def test_search_by_accession_number(self): + accession_number = "BEA 0014B" + strain = retrieve_strain_by_accession_number(self.client, accession_number) + self.assertEqual(strain.id.strain_id, accession_number) + + def test_search_by_accession_number_not_found(self): + accession_number = "BEA 0014B_"  # deliberately unknown accession number + strain = retrieve_strain_by_accession_number(self.client, accession_number) + self.assertFalse(strain) + + +class BiolomicsClientGrowthMediaTest(unittest.TestCase): + def setUp(self): + self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID, + SECRET_ID, USERNAME, PASSWORD) + + def xtest_growth_media_by_name(self): + gm = self.client.retrieve('growth_media', 'AAA') + self.assertEqual(gm['Record Id'], 1) + + +if __name__ == "__main__": + # import sys;sys.argv = ['', + # 'BiolomicsWriter.test_mirri_excel_parser_invalid'] + unittest.main() diff --git a/tests/biolomics/utils.py b/tests/biolomics/utils.py new file mode 100644 index 0000000..0674577 --- /dev/null +++ b/tests/biolomics/utils.py @@ -0,0 +1,99 @@ +from mirri.biolomics.serializers.strain import StrainMirri +from mirri.entities.strain import StrainId, OrganismType +from mirri.entities.sequence import GenomicSequence +from mirri.entities.date_range import DateRange +from mirri.entities.publication import Publication +from mirri.settings import NAGOYA_NO_RESTRICTIONS + +VERSION = 'v2' +SERVER_URL = 'https://webservices.bio-aware.com/mirri_test' + + +def create_full_data_strain(): + strain = StrainMirri() + + strain.id.number = "1" + strain.id.collection = "TESTCC" + strain.id.url = "https://cect/2342" + + strain.restriction_on_use = "no_restriction" + strain.nagoya_protocol = NAGOYA_NO_RESTRICTIONS + strain.abs_related_files = ['https://example.com'] + strain.mta_files = ['https://example.com'] + strain.other_numbers.append(StrainId(collection="aaa", number="a")) + 
strain.other_numbers.append(StrainId(collection="aaa3", number="a3")) + strain.is_from_registered_collection = False + strain.risk_group = '1' + strain.is_potentially_harmful = True + strain.is_subject_to_quarantine = False + + strain.taxonomy.organism_type = [OrganismType(2)] + strain.taxonomy.genus = 'Escherichia' + strain.taxonomy.species = 'coli' + strain.taxonomy.interspecific_hybrid = False + strain.taxonomy.infrasubspecific_name = 'serovar tete' + strain.taxonomy.comments = 'lalalalla' + + strain.status = "type of Bacillus alcalophilus" + strain.history = 'firstplave < seconn place < third place' + + strain.deposit.who = "NCTC, National Collection of Type Cultures - NCTC, London, United Kingdom of Great Britain and Northern Ireland." + strain.deposit.date = DateRange(year=1985, month=5, day=2) + strain.catalog_inclusion_date = DateRange(year=1985, month=5, day=2) + + strain.collect.location.country = "ESP" + strain.collect.location.state = "una state" + strain.collect.location.municipality = "one municipality" + strain.collect.location.longitude = 23.3 + strain.collect.location.latitude = 23.3 + strain.collect.location.altitude = 121 + strain.collect.location.site = "somewhere in the world" + strain.collect.habitat_ontobiotope = "OBT:000190" + strain.collect.habitat = 'some habitat' + strain.collect.who = "the collector" + strain.collect.date = DateRange(year=1991) + + strain.isolation.date = DateRange(year=1900) + strain.isolation.who = 'the isolator' + strain.isolation.substrate_host_of_isolation = 'some substrate' + + # already existing media in test_mirri + + strain.growth.recommended_temp = {'min': 30, 'max': 30} + strain.growth.recommended_media = ["AAA"] + strain.growth.tested_temp_range = {'min': 29, 'max': 32} + + strain.form_of_supply = ["Agar", "Lyo"] + + #strain.other_denominations = ["lajdflasjdldj"] + + gen_seq = GenomicSequence() + gen_seq.marker_id = "pepe" + gen_seq.marker_type = "16S rRNA" + strain.genetics.markers.append(gen_seq) + 
strain.genetics.ploidy = 9 + strain.genetics.genotype = 'some genotupe' + strain.genetics.gmo = True + strain.genetics.gmo_construction = 'instructrion to build' + strain.genetics.mutant_info = 'x-men' + strain.genetics.sexual_state = 'MT+A' + strain.genetics.plasmids = ['asda'] + strain.genetics.plasmids_in_collections = ['asdasda'] + + pub = Publication() + pub.title = "The genus Amylomyces" + strain.publications = [pub] + + strain.plant_pathogenicity_code = 'PATH:001' + strain.pathogenicity = 'illness' + strain.enzyme_production = 'some enzimes' + strain.production_of_metabolites = 'big factory of cheese' + strain.applications = 'health' + + strain.remarks = 'no remarks for me' + return strain + + +if __name__ == '__main__': + strain = create_full_data_strain() + print(strain.collect.habitat_ontobiotope) diff --git a/tests/data/invalid_content.mirri.xlsx b/tests/data/invalid_content.mirri.xlsx new file mode 100644 index 0000000..9b2d8c9 Binary files /dev/null and b/tests/data/invalid_content.mirri.xlsx differ diff --git a/tests/data/invalid_excel.mirri.json b/tests/data/invalid_excel.mirri.json new file mode 100644 index 0000000..e7cf986 --- /dev/null +++ b/tests/data/invalid_excel.mirri.json @@ -0,0 +1,5 @@ +{ + "key1": "value1", + "key2": "value2", + "key3": "value3" +} \ No newline at end of file diff --git a/tests/data/invalid_structure.mirri.xlsx b/tests/data/invalid_structure.mirri.xlsx new file mode 100644 index 0000000..274fec1 Binary files /dev/null and b/tests/data/invalid_structure.mirri.xlsx differ diff --git a/tests/data/valid.mirri.full.xlsx b/tests/data/valid.mirri.full.xlsx new file mode 100644 index 0000000..e141c19 Binary files /dev/null and b/tests/data/valid.mirri.full.xlsx differ diff --git a/tests/data/valid.mirri.xlsx b/tests/data/valid.mirri.xlsx new file mode 100644 index 0000000..9685a80 Binary files /dev/null and b/tests/data/valid.mirri.xlsx differ diff --git a/tests/test_entities.py b/tests/test_entities.py new file mode 100644 index 
0000000..13db9ca --- /dev/null +++ b/tests/test_entities.py @@ -0,0 +1,318 @@ +""" +Created on 2020(e)ko abe. 2(a) + +@author: peio +""" + +import unittest + +from mirri.entities.publication import Publication +from mirri.entities.date_range import DateRange +from mirri.entities.location import Location +from mirri.entities.sequence import GenomicSequence +from mirri.entities.strain import ( + Collect, + Deposit, + Isolation, + ValidationError, + OrganismType, + Strain, + StrainId, + Taxonomy, +) +from mirri.settings import ( + COLLECT, + COUNTRY, + DATE_OF_ISOLATION, + DEPOSIT, + DEPOSITOR, + GENETICS, + GROWTH, + ISOLATED_BY, + ISOLATION, + LOCATION, + MARKERS, + NAGOYA_DOCS_AVAILABLE, + NAGOYA_PROTOCOL, + ORGANISM_TYPE, + OTHER_CULTURE_NUMBERS, + PLOIDY, + RECOMMENDED_GROWTH_MEDIUM, + TAXONOMY, + DATE_OF_INCLUSION, NO_RESTRICTION +) +from mirri.validation.entity_validators import validate_strain + + +class TestDataRange(unittest.TestCase): + def test_data_range_init(self): + dr = DateRange() + + self.assertFalse(dr) + + self.assertEqual(dr.__str__(), "") + self.assertEqual(dr.range["start"], None) + self.assertEqual(dr.range["end"], None) + + dr.strpdate("2012") + self.assertEqual(dr.strfdate, "2012----") + self.assertTrue(dr) + + dr.strpdate("2012----") + self.assertEqual(dr.strfdate, "2012----") + + dr.strpdate("201212--") + self.assertEqual(dr.strfdate, "201212--") + try: + dr.strpdate("201213--") + self.fail() + except ValueError: + pass + + try: + dr = DateRange(year=2012, month=13) + self.fail() + except ValueError: + pass + + dr = DateRange(year=2020) + self.assertEqual(dr.strfdate, "2020----") + + dr2 = dr.strpdate("2012") + self.assertEqual(dr2.range["start"].year, 2012) + self.assertEqual(dr2.range["start"].month, 1) + self.assertEqual(dr2.range["start"].day, 1) + + self.assertEqual(dr2.range["end"].year, 2012) + self.assertEqual(dr2.range["end"].month, 12) + self.assertEqual(dr2.range["end"].day, 31) + + +class TestCollect(unittest.TestCase): + def 
test_collect_basic(self): + collect = Collect() + self.assertEqual(collect.dict(), {}) + + collect.location.country = "ESP" + collect.date = DateRange().strpdate("2012----") + + collect.who = "pepito" + self.assertEqual( + dict(collect.dict()), + { + "location": {"countryOfOriginCode": "ESP"}, + "collected_by": "pepito", + "date_of_collection": "2012----", + }, + ) + self.assertEqual(collect.__str__(), + "Collected: Spain in 2012---- by pepito") + + +class TestOrganismType(unittest.TestCase): + def test_basic_usage(self): + org_type = OrganismType(2) + self.assertEqual(org_type.name, "Archaea") + self.assertEqual(org_type.code, 2) + try: + org_type.ko = 'a' + self.fail() + except TypeError: + pass + + org_type = OrganismType("Archaea") + + +class TestTaxonomy(unittest.TestCase): + def test_taxonomy_basic(self): + taxonomy = Taxonomy() + self.assertEqual(taxonomy.dict(), {}) + self.assertFalse(taxonomy) + + def test_taxonomy_with_data(self): + taxonomy = Taxonomy() + taxonomy.genus = "Bacilus" + taxonomy.organism_type = [OrganismType("Archaea")] + taxonomy.species = "vulgaris" + self.assertEqual(taxonomy.long_name, "Bacilus vulgaris") + + # print(taxonomy.dict()) + + +class TestLocation(unittest.TestCase): + def test_empty_init(self): + loc = Location() + self.assertEqual(loc.dict(), {}) + self.assertFalse(loc) + + def test_add_data(self): + loc = Location() + loc.country = "esp" + self.assertEqual(loc.dict(), {COUNTRY: "esp"}) + loc.state = None + self.assertEqual(loc.dict(), {COUNTRY: "esp"}) + + +class TestStrain(unittest.TestCase): + def test_empty_strain(self): + strain = Strain() + self.assertEqual(strain.dict(), {}) + + def test_strain_add_data(self): + strain = Strain() + + strain.id.number = "5433" + strain.id.collection = "CECT" + strain.id.url = "https://cect/2342" + + try: + strain.nagoya_protocol = "asdas" + self.fail() + except ValidationError: + pass + + strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE + strain.dict()[NAGOYA_PROTOCOL] = 
NAGOYA_DOCS_AVAILABLE + + strain.collect.location.country = "ESP" + + self.assertEqual(strain.dict()[COLLECT][LOCATION][COUNTRY], "ESP") + + strain.genetics.ploidy = 9 + self.assertEqual(strain.dict()[GENETICS][PLOIDY], 9) + + strain.growth.recommended_media = ["asd"] + strain.isolation.date = DateRange(year=1900) + self.assertEqual(strain.dict()[ISOLATION] + [DATE_OF_ISOLATION], "1900----") + + strain.deposit.who = "pepe" + self.assertEqual(strain.dict()[DEPOSIT][DEPOSITOR], "pepe") + + strain.growth.recommended_media = ["11"] + self.assertEqual(strain.dict()[GROWTH] + [RECOMMENDED_GROWTH_MEDIUM], ["11"]) + + strain.taxonomy.organism_type = [OrganismType(2)] + self.assertEqual( + strain.dict()[TAXONOMY][ORGANISM_TYPE], [ + {"code": 2, "name": "Archaea"}] + ) + + strain.taxonomy.organism_type = [OrganismType("Algae")] + self.assertEqual( + strain.dict()[TAXONOMY][ORGANISM_TYPE], [ + {"code": 1, "name": "Algae"}] + ) + + strain.other_numbers.append(StrainId(collection="aaa", number="a")) + strain.other_numbers.append(StrainId(collection="aaa3", number="a3")) + self.assertEqual( + strain.dict()[OTHER_CULTURE_NUMBERS], + [ + {"collection_code": "aaa", "accession_number": "a"}, + {"collection_code": "aaa3", "accession_number": "a3"}, + ], + ) + strain.form_of_supply = ["Agar", "Lyo"] + gen_seq = GenomicSequence() + self.assertEqual(gen_seq.dict(), {}) + gen_seq.marker_id = "pepe" + gen_seq.marker_type = "16S rRNA" + strain.genetics.markers.append(gen_seq) + self.assertEqual( + strain.dict()[GENETICS][MARKERS], + [{"marker_type": "16S rRNA", "INSDC": "pepe"}], + ) + + strain.collect.habitat_ontobiotope = "OBT:111111" + self.assertEqual(strain.collect.habitat_ontobiotope, "OBT:111111") + + try: + strain.collect.habitat_ontobiotope = "OBT:11111" + self.fail() + except ValidationError: + pass + + # publications + try: + strain.publications = 1 + self.fail() + except ValidationError: + pass + pub = Publication() + pub.id = "1" + try: + strain.publications = pub + 
self.fail() + except ValidationError: + pass + + strain.publications = [pub] + self.assertEqual(strain.publications[0].id, "1") + + strain.catalog_inclusion_date = DateRange(year=1992) + self.assertEqual(strain.dict()[DATE_OF_INCLUSION], '1992----') + + import pprint + + pprint.pprint(strain.dict()) + + def test_strain_validation(self): + strain = Strain() + strain.form_of_supply = ['Lyo'] + + return + + errors = validate_strain(strain) + self.assertEqual(len(errors), 10) + + strain.id.collection = 'test' + strain.id.number = '1' + + + errors = validate_strain(strain) + self.assertEqual(len(errors), 9) + + strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE + strain.restriction_on_use = NO_RESTRICTION + strain.risk_group = 1 + strain.taxonomy.organism_type = [OrganismType(4)] + strain.taxonomy.hybrids = ['Sac lac', 'Sac lcac3'] + strain.growth.recommended_media = ['aa'] + strain.growth.recommended_temp = {'min': 2, 'max':5} + strain.form_of_supply = ['lyo'] + strain.collect.location.country = 'ESP' + errors = validate_strain(strain) + self.assertFalse(errors) + + +class TestIsolation(unittest.TestCase): + def test_iniatialize_isollation(self): + isolation = Isolation() + self.assertEqual(isolation.dict(), {}) + isolation.who = "pepito" + self.assertTrue(ISOLATED_BY in isolation.dict()) + isolation.date = DateRange().strpdate("2012----") + self.assertTrue(DATE_OF_ISOLATION in isolation.dict()) + + try: + isolation.location.site = "spain" + self.fail() + except (ValueError, AttributeError): + pass + + +class TestGenomicSequence(unittest.TestCase): + def test_empty_init(self): + gen_seq = GenomicSequence() + self.assertEqual(gen_seq.dict(), {}) + gen_seq.marker_id = "pepe" + gen_seq.marker_type = "16S rRNA" + self.assertEqual(gen_seq.dict(), { + "marker_type": "16S rRNA", "INSDC": "pepe"}) + + +if __name__ == "__main__": + # import sys;sys.argv = ['', 'TestStrain'] + unittest.main() diff --git a/tests/test_parsers.py b/tests/test_parsers.py new file mode 100644 index 
0000000..96d5f8b --- /dev/null +++ b/tests/test_parsers.py @@ -0,0 +1,51 @@ +from mirri.entities.strain import ValidationError +import unittest +from pathlib import Path +from pprint import pprint +from mirri.io.parsers.mirri_excel import parse_mirri_excel + +TEST_DATA_DIR = Path(__file__).parent / "data" + + +class MirriExcelTests(unittest.TestCase): + + def test_mirri_excel_parser(self): + in_path = TEST_DATA_DIR / "valid.mirri.xlsx" + with in_path.open("rb") as fhand: + parsed_data = parse_mirri_excel(fhand, version="20200601") + + medium = parsed_data["growth_media"][0] + self.assertEqual("1", medium.acronym) + self.assertEqual(medium.description, "NUTRIENT BROTH/AGAR I") + + strains = list(parsed_data["strains"]) + strain = strains[0] + self.assertEqual(strain.publications[0].id, 1) + self.assertEqual(strain.publications[0].title, 'Cosa') + self.assertEqual(strain.id.number, "1") + pprint(strain.dict()) + + def xtest_mirri_excel_parser_invalid_fail(self): + in_path = TEST_DATA_DIR / "invalid.mirri.xlsx" + with in_path.open("rb") as fhand: + try: + parse_mirri_excel(fhand, version="20200601") + self.fail() + except ValidationError: + pass + + def xtest_mirri_excel_parser_invalid(self): + in_path = TEST_DATA_DIR / "invalid.mirri.xlsx" + with in_path.open("rb") as fhand: + parsed_data = parse_mirri_excel( + fhand, version="20200601") + + errors = parsed_data["errors"] + for _id, _errors in errors.items(): + print(_id, _errors) + + +if __name__ == "__main__": + # import sys;sys.argv = ['', + # 'MirriExcelTests.test_mirri_excel_parser_invalid'] + unittest.main() diff --git a/tests/test_validation.py b/tests/test_validation.py new file mode 100644 index 0000000..f809a5d --- /dev/null +++ b/tests/test_validation.py @@ -0,0 +1,589 @@ +from datetime import datetime +import unittest +from pathlib import Path +from itertools import chain + +from mirri.validation.tags import ( + CHOICES, + COORDINATES, + CROSSREF, + CROSSREF_NAME, + DATE, + MATCH, + MISSING, + MULTIPLE, + 
NUMBER, + REGEXP, + SEPARATOR, + TAXON, + TYPE, + UNIQUE, + VALUES +) + +from mirri.validation.excel_validator import ( + is_valid_choices, + is_valid_coords, + is_valid_crossrefs, + is_valid_date, + is_valid_missing, + is_valid_number, + is_valid_regex, + is_valid_taxon, + is_valid_unique, + is_valid_file, + validate_mirri_excel, +) + + +TEST_DATA_DIR = Path(__file__).parent / "data" +TS_VALUE = "value" +TS_CONF = "conf" +TS_ASSERT = "assert_func" + + +class MirriExcelValidationTests(unittest.TestCase): + + def test_validation_structure(self): + in_path = TEST_DATA_DIR / "invalid_structure.mirri.xlsx" + with in_path.open("rb") as fhand: + error_log = validate_mirri_excel(fhand) + + entities = [] + err_codes = [] + for ett, errors in error_log.get_errors().items(): + entities.append(ett) + err_codes.extend([err.code for err in errors]) + + self.assertIn("EFS", entities) + self.assertIn("STD", entities) + self.assertIn("GOD", entities) + self.assertIn("GMD", entities) + + self.assertIn("EFS03", err_codes) + self.assertIn("EFS06", err_codes) + self.assertIn("EFS08", err_codes) + self.assertIn("GOD06", err_codes) + self.assertIn("GMD01", err_codes) + self.assertIn("STD05", err_codes) + self.assertIn("STD08", err_codes) + self.assertIn("STD12", err_codes) + + def test_validation_content(self): + in_path = TEST_DATA_DIR / "invalid_content.mirri.xlsx" + with in_path.open("rb") as fhand: + error_log = validate_mirri_excel(fhand) + + entities = [] + err_codes = [] + for ett, errors in error_log.get_errors().items(): + entities.append(ett) + err_codes.extend([err.code for err in errors]) + + self.assertTrue(len(err_codes) > 0) + + self.assertNotIn("EFS", entities) + self.assertIn("STD", entities) + self.assertIn("GOD", entities) + self.assertIn("GID", entities) + + self.assertIn("GOD04", err_codes) + self.assertIn("GOD07", err_codes) + self.assertIn("GID03", err_codes) + self.assertIn("STD11", err_codes) + self.assertIn("STD15", err_codes) + self.assertIn("STD22", 
err_codes) + self.assertIn("STD04", err_codes) + self.assertIn("STD10", err_codes) + self.assertIn("STD07", err_codes) + self.assertIn("STD14", err_codes) + self.assertIn("STD16", err_codes) + + def test_validation_valid(self): + in_path = TEST_DATA_DIR / "valid.mirri.xlsx" + with in_path.open("rb") as fhand: + error_log = validate_mirri_excel(fhand) + + self.assertTrue(len(error_log.get_errors()) == 0) + + +class ValidatoionFunctionsTest(unittest.TestCase): + + def test_is_valid_regex(self): + tests = [ + { + TS_VALUE: "abcDEF", + TS_CONF: {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "123456", + TS_CONF: {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: "123456", + TS_CONF: {TYPE: REGEXP, MATCH: r"\d+"}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "abcdef", + TS_CONF: {TYPE: REGEXP, MATCH: r"\d+"}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: "abc 123", + TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "123 abc", + TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "123 ", + TS_CONF: {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_regex(value, conf)) + + def test_is_valid_choices(self): + tests = [ + { + TS_VALUE: "1", + TS_CONF: {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "1, 3", + TS_CONF: { + TYPE: CHOICES, + VALUES: ["1", "2", "3", "4"], + MULTIPLE: True, + SEPARATOR: "," + }, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "5", + TS_CONF: {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with 
self.subTest(value=value): + assert_func(is_valid_choices(value, conf)) + + def test_is_valid_crossref(self): + tests = [ + { + TS_VALUE: "abc", + TS_CONF: { + TYPE: CROSSREF, + CROSSREF_NAME: "values", + "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, + }, + TS_ASSERT: self.assertTrue, + }, + { + TS_VALUE: "123", + TS_CONF: { + TYPE: CROSSREF, + CROSSREF_NAME: "values", + "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, + }, + TS_ASSERT: self.assertFalse, + }, + { + TS_VALUE: "abc, def", + TS_CONF: { + TYPE: CROSSREF, + CROSSREF_NAME: "values", + "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, + MULTIPLE: True, + SEPARATOR: ",", + }, + TS_ASSERT: self.assertTrue, + }, + { + TS_VALUE: "abc, 123", + TS_CONF: { + TYPE: CROSSREF, + CROSSREF_NAME: "values", + "crossrefs_pointer": {"values": ["abc", "def", "ghi"]}, + MULTIPLE: True, + SEPARATOR: ",", + }, + TS_ASSERT: self.assertFalse, + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_crossrefs(value, conf)) + + def test_is_valid_missing(self): + tests = [ + { + TS_VALUE: 1, + TS_CONF: {TYPE: MISSING}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "abc", + TS_CONF: {TYPE: MISSING}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: None, + TS_CONF: {TYPE: MISSING}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_missing(value, conf)) + + def test_is_valid_date(self): + tests = [ + { + TS_VALUE: '2020-04-07', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: '2020/04/07', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: datetime(2021, 5, 1), + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: '2020-05', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertTrue + 
}, + { + TS_VALUE: '2020/05', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 2020, + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: '2021 05 01', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: '04-07-2020', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: '2021-02-31', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: '2021-15', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: '15-2021', + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: 3000, + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: -2020, + TS_CONF: {TYPE: DATE}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_date(value, conf)) + + def test_is_valid_coordinates(self): + tests = [ + { + TS_VALUE: "23; 50", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "-90; -100", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "90; 100", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "0; 0", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "10; 20; 5", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "10; 20; -5", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "91; 50", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: "87; 182", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: "-200; 182", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: "20, 40", + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: "abc def", + TS_CONF: {TYPE: 
COORDINATES}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: 123, + TS_CONF: {TYPE: COORDINATES}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_coords(value, conf)) + + def test_is_valid_number(self): + tests = [ + { + TS_VALUE: 1, + TS_CONF: {TYPE: NUMBER}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 2.5, + TS_CONF: {TYPE: NUMBER}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "10", + TS_CONF: {TYPE: NUMBER}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "10.5", + TS_CONF: {TYPE: NUMBER}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 5, + TS_CONF: {TYPE: NUMBER, "min": 0}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 5, + TS_CONF: {TYPE: NUMBER, "max": 10}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 5, + TS_CONF: {TYPE: NUMBER, "min": 0, "max": 10}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "hello", + TS_CONF: {TYPE: NUMBER}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: 10, + TS_CONF: {TYPE: NUMBER, "max": 5}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: 0, + TS_CONF: {TYPE: NUMBER, "min": 5}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_number(value, conf)) + + def test_is_valid_taxon(self): + tests = [ + { + TS_VALUE: 'sp. species', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 'spp species subsp. subspecies', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 'spp species subsp. subspecies var. 
variety', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 'spp taxon', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 'Candidaceae', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: 'sp sp species', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertFalse + }, + { + TS_VALUE: 'spp species abc. def', + TS_CONF: {TYPE: TAXON}, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_taxon(value, conf)) + + def test_is_valid_unique(self): + tests = [ + { + TS_VALUE: "abc", + TS_CONF: { + TYPE: UNIQUE, + "label": "values", + "shown_values": {} + }, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "jkl", + TS_CONF: { + TYPE: UNIQUE, + "label": "values", + "shown_values": { + "values": {"abc": '', + "def": '', + "ghi": ''}, + } + }, + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: "abc", + TS_CONF: { + TYPE: UNIQUE, + "label": "values", + "shown_values": { + "values": {"abc": '', + "def": '', + "ghi": ''}, + } + }, + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + conf = test[TS_CONF] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_unique(value, conf)) + + def test_is_valid_file(self): + tests = [ + { + TS_VALUE: TEST_DATA_DIR / "invalid_structure.mirri.xlsx", + TS_ASSERT: self.assertTrue + }, + { + TS_VALUE: TEST_DATA_DIR / "invalid_excel.mirri.json", + TS_ASSERT: self.assertFalse + }, + ] + + for test in tests: + value = test[TS_VALUE] + assert_func = test[TS_ASSERT] + with self.subTest(value=value): + assert_func(is_valid_file(value,)) + + +if __name__ == "__main__": + import sys + # sys.argv = ['', + # 'ValidatoionFunctionsTest.test_is_valid_regex'] + unittest.main() diff --git a/tests/test_writers.py b/tests/test_writers.py new file mode 100644 index 
"""Smoke test for the MIRRI Excel writer.

Parses a known-good MIRRI workbook from the test data directory and writes
the parsed strains and growth media back out with ``write_mirri_excel``.
"""
# File: tests/test_writers.py

import tempfile
import unittest
from pathlib import Path

from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.io.writers.mirri_excel import write_mirri_excel

TEST_DATA_DIR = Path(__file__).parent / "data"


class MirriExcelTests(unittest.TestCase):
    """Parse a MIRRI Excel file and write its content to a new workbook."""

    def test_valid_excel(self):
        """Write the parsed content of a valid workbook to a fresh file.

        The output goes into a temporary directory so the test does not
        depend on ``/tmp`` existing and leaves nothing behind.
        """
        version = "20200601"
        in_path = TEST_DATA_DIR / "valid.mirri.full.xlsx"

        # The input handle is kept open until the writer has finished, in
        # case the parsed data is consumed lazily -- TODO confirm whether
        # the parser already reads everything eagerly.
        with in_path.open("rb") as fhand, \
                tempfile.TemporaryDirectory() as tmp_dir:
            parsed_data = parse_mirri_excel(fhand, version=version)
            strains = parsed_data["strains"]
            growth_media = parsed_data["growth_media"]

            out_path = Path(tmp_dir) / "test.xlsx"
            write_mirri_excel(out_path, strains, growth_media,
                              version=version)

            # Assumes the writer creates the workbook at the path it is
            # given; without a check the test could only fail on exceptions.
            self.assertTrue(out_path.is_file(),
                            "write_mirri_excel did not create the output file")


if __name__ == "__main__":
    unittest.main()