First import

This commit is contained in:
Jose Miguel López Coronado 2022-02-18 12:09:05 +01:00
commit 332876f58c
73 changed files with 12572 additions and 0 deletions

19
README.md Normal file
View File

@ -0,0 +1,19 @@
# MIRRI Utils
## Installation
> pip install path_to_package.tar.gz
## Description
A small set of utilities to deal with MIRRI data.
- A data class to deal with strain data.
- An Excel reader for the MIRRI specification.
- An Excel validator for the MIRRI specification.
- An Excel writer to create the Excel file with the MIRRI specifications.

View File

@ -0,0 +1,77 @@
#!/usr/bin/env python3
import argparse
import sys
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS, STRAIN_WS
SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
def get_cmd_args():
    """Parse the command line and return the settings as a dict.

    Returns a dict with keys: accession_number, user, password,
    client_id and client_secret.
    """
    # BUG FIX: the description previously read "Upload strains to MIRRI-IS",
    # which did not match this script (it removes duplicated strain records).
    desc = "Delete the duplicated strains in MIRRI-IS for an accession number"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-a', '--accession_number', required=True,
                        help='Delete the duplicated items in database for the given accession number')
    parser.add_argument('-u', '--ws_user', help='Username of the web service',
                        required=True)
    parser.add_argument('-p', '--ws_password', required=True,
                        help='Password of the web service user')
    parser.add_argument('-c', '--client_id', required=True,
                        help='Client id of the web service')
    parser.add_argument('-s', '--client_secret', required=True,
                        help='Client secret of the web service')
    args = parser.parse_args()
    return {'accession_number': args.accession_number, 'user': args.ws_user,
            'password': args.ws_password, 'client_id': args.client_id,
            'client_secret': args.client_secret}
def write_errors_in_screen(errors, fhand=sys.stderr):
    """Write the validation errors to *fhand*, grouped and underlined by key."""
    for section, section_errors in errors.items():
        fhand.write(f'{section}\n')
        fhand.write('-' * len(section))
        fhand.write('\n')
        for err in section_errors:
            prefix = f'{err.pk}: ' if err.pk else ''
            fhand.write(f'{prefix}{err.message} - {err.code}\n')
        fhand.write('\n')
def main():
    """Entry point: locate a strain by accession number and delete all but
    one of its duplicated records.

    Exits with status 0 (and a message) when the accession is absent or
    not duplicated.
    """
    args = get_cmd_args()
    out_fhand = sys.stdout
    client = BiolomicsMirriClient(server_url=SERVER_URL, api_version='v2',
                                  client_id=args['client_id'],
                                  client_secret=args['client_secret'],
                                  username=args['user'],
                                  password=args['password'])
    query = {"Query": [{"Index": 0,
                        "FieldName": "Collection accession number",
                        "Operation": "TextExactMatch",
                        "Value": args['accession_number']}],
             "Expression": "Q0",
             "DisplayStart": 0,
             "DisplayLength": 10}
    result = client.search(STRAIN_WS, query=query)
    total = result["total"]
    if total == 0:
        out_fhand.write('Accession not in database\n')
        # BUG FIX: removed an unreachable "return None" that followed this
        # sys.exit(0) call.
        sys.exit(0)
    elif total == 1:
        out_fhand.write('Accession is not duplicated\n')
        sys.exit(0)
    print(f'Duplicates found: {total}. removing duplicates')
    # Keep the last record returned by the search and delete the rest.
    duplicated_ids = [record.record_id for record in result['records']]
    for duplicated_id in duplicated_ids[:-1]:
        client.delete_by_id(STRAIN_WS, duplicated_id)


if __name__ == '__main__':
    main()

91
bin/delete_mirri_data.py Normal file
View File

@ -0,0 +1,91 @@
#!/usr/bin/env python3
import argparse
import sys
from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS, STRAIN_WS
from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.validation.excel_validator import validate_mirri_excel
SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
def get_cmd_args():
    """Parse the command line and return the settings as a dict.

    Returns a dict with keys: input_fhand, user, version, password,
    client_id, client_secret and update.
    """
    # BUG FIX: the description previously read "Upload strains to MIRRI-IS",
    # which did not match this script (it deletes growth media and strains).
    desc = "Delete the strains and growth media of an Excel file from MIRRI-IS"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-i', '--input', help='Validated Excel file',
                        type=argparse.FileType('rb'), required=True)
    parser.add_argument('-v', '--spec_version', default='20200601',
                        help='Version of the specification of the given excel file')
    parser.add_argument('-u', '--ws_user', help='Username of the web service',
                        required=True)
    parser.add_argument('-p', '--ws_password', required=True,
                        help='Password of the web service user')
    parser.add_argument('-c', '--client_id', required=True,
                        help='Client id of the web service')
    parser.add_argument('-s', '--client_secret', required=True,
                        help='Client secret of the web service')
    parser.add_argument('-f', '--force_update', required=False,
                        action='store_true',
                        help='Use it if you want to update the existing strains')
    args = parser.parse_args()
    return {'input_fhand': args.input, 'user': args.ws_user,
            'version': args.spec_version,
            'password': args.ws_password, 'client_id': args.client_id,
            'client_secret': args.client_secret, 'update': args.force_update}
def write_errors_in_screen(errors, fhand=sys.stderr):
    """Report validation errors to *fhand*, one underlined section per key."""
    for key in errors:
        fhand.write(f'{key}\n')
        underline = '-' * len(key)
        fhand.write(underline + '\n')
        for error in errors[key]:
            if error.pk:
                fhand.write(f'{error.pk}: ')
            fhand.write(f'{error.message} - {error.code}\n')
        fhand.write('\n')
def main():
    """Validate the given Excel file, then delete its growth media and
    strains from the MIRRI-IS test server.

    Exits with status 1 (printing the errors) when validation fails.
    """
    args = get_cmd_args()
    input_fhand = args['input_fhand']
    spec_version = args['version']
    out_fhand = sys.stderr
    # Validate first: the database is not touched if the file has errors.
    error_log = validate_mirri_excel(input_fhand, version=spec_version)
    errors = error_log.get_errors()
    if errors:
        write_errors_in_screen(errors, out_fhand)
        sys.exit(1)
    # The validator consumed the file handle; rewind before parsing.
    input_fhand.seek(0)
    parsed_objects = parse_mirri_excel(input_fhand, version=spec_version)
    strains = list(parsed_objects['strains'])
    growth_media = list(parsed_objects['growth_media'])
    client = BiolomicsMirriClient(server_url=SERVER_URL, api_version= 'v2',
                                  client_id=args['client_id'],
                                  client_secret=args['client_secret'],
                                  username=args['user'],
                                  password=args['password'])
    for gm in growth_media:
        try:
            client.delete_by_name(GROWTH_MEDIUM_WS, gm.acronym)
        except ValueError as error:
            # Best effort: report growth media that could not be deleted
            # (e.g. not present in the database) and keep going.
            print(error)
            continue
        print(f'Growth medium {gm.acronym} deleted')
    for strain in strains:
        ws_strain = retrieve_strain_by_accession_number(client, strain.id.strain_id)
        if ws_strain is not None:
            client.delete_by_id(STRAIN_WS, ws_strain.record_id)
            print(f'Strain {strain.id.strain_id} deleted')
        else:
            print(f'Strain {strain.id.strain_id} not in database')


if __name__ == '__main__':
    main()

View File

@ -0,0 +1,182 @@
#!/usr/bin/env python3
import argparse
import sys
from collections import Counter
from mirri.biolomics.pipelines.growth_medium import get_or_create_or_update_growth_medium
from mirri.biolomics.pipelines.strain import get_or_create_or_update_strain
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.validation.excel_validator import validate_mirri_excel
TEST_SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
PROD_SERVER_URL = 'https://webservices.bio-aware.com/mirri'
def get_cmd_args():
    """Parse the command line and return the settings as a dict.

    Returns a dict with keys: input_fhand, user, version, password,
    client_id, client_secret, update, verbose, use_production_server,
    add_gm, add_strains and skip_first_num.
    """
    desc = "Upload strains to MIRRI-IS"
    parser = argparse.ArgumentParser(description=desc)
    parser.add_argument('-i', '--input', help='Validated Excel file',
                        type=argparse.FileType('rb'), required=True)
    parser.add_argument('-v', '--spec_version', default='20200601',
                        help='Version of the specification of the given excel file')
    parser.add_argument('-u', '--ws_user', help='Username of the web service',
                        required=True)
    parser.add_argument('-p', '--ws_password', required=True,
                        help='Password of the web service user')
    parser.add_argument('-c', '--client_id', required=True,
                        help='Client id of the web service')
    parser.add_argument('-s', '--client_secret', required=True,
                        help='Client secret of the web service')
    parser.add_argument('--force_update', required=False,
                        action='store_true',
                        help='Use it if you want to update the existing strains')
    parser.add_argument('--verbose', action='store_true',
                        help='use it if you want a verbose output')
    parser.add_argument('--prod', action='store_true',
                        help='Use production server')
    parser.add_argument('--dont_add_gm', action='store_false',
                        help="Don't add growth media", default=True)
    # BUG FIX: the help text was a copy-paste of the growth-media option.
    parser.add_argument('--dont_add_strains', action='store_false',
                        help="Don't add strains", default=True)
    parser.add_argument('--skip_first_num', type=int,
                        help='skip first X strains to the tool')
    args = parser.parse_args()
    return {'input_fhand': args.input, 'user': args.ws_user,
            'version': args.spec_version,
            'password': args.ws_password, 'client_id': args.client_id,
            'client_secret': args.client_secret, 'update': args.force_update,
            'verbose': args.verbose, 'use_production_server': args.prod,
            'add_gm': args.dont_add_gm, 'add_strains': args.dont_add_strains,
            'skip_first_num': args.skip_first_num}
def write_errors_in_screen(errors, fhand=sys.stderr):
    """Dump validation errors to *fhand*, grouped by error key."""
    for heading, entries in errors.items():
        fhand.write(f'{heading}\n')
        fhand.write('-' * len(heading) + '\n')
        for entry in entries:
            lead = f'{entry.pk}: ' if entry.pk else ''
            fhand.write(lead + f'{entry.message} - {entry.code}\n')
        fhand.write('\n')
def create_or_upload_strains(client, strains, update=False, counter=None,
                             out_fhand=None, seek=None):
    """Create or update every strain through the web service client.

    Strains with an index lower than *seek* are skipped.  Outcomes
    ('created'/'updated'/'not modified') are tallied in *counter* and
    reported line by line to *out_fhand* when those are given.
    """
    for index, strain in enumerate(strains):
        if seek is not None and index < seek:
            continue
        outcome = get_or_create_or_update_strain(client, strain, update=update)
        record = outcome['record']
        if outcome.get('updated', False):
            state = 'updated'
        elif outcome['created']:
            state = 'created'
        else:
            state = 'not modified'
        if counter is not None:
            counter[state] += 1
        if out_fhand is not None:
            out_fhand.write(f'{index}: Strain {record.id.strain_id}: {state}\n')
def create_or_upload_growth_media(client, growth_media, update=False, counter=None,
                                  out_fhand=None):
    """Create or update every growth medium through the web service client.

    Outcomes ('created'/'updated'/'not modified') are tallied in *counter*
    and reported to *out_fhand* when those are given.
    """
    for gm in growth_media:
        outcome = get_or_create_or_update_growth_medium(client, gm, update)
        record = outcome['record']
        if outcome.get('updated', False):
            state = 'updated'
        elif outcome['created']:
            state = 'created'
        else:
            state = 'not modified'
        if counter is not None:
            counter[state] += 1
        if out_fhand is not None:
            out_fhand.write(f'Growth medium {record.record_name}: {state}\n')
def main():
    """Validate the Excel file and upload its growth media and strains to
    the MIRRI-IS web service.

    Exits with status 1 (printing the errors) when validation fails.  Each
    upload phase runs inside a client-side "transaction" so that created
    records can be rolled back on failure.
    """
    args = get_cmd_args()
    input_fhand = args['input_fhand']
    spec_version = args['version']
    out_fhand = sys.stdout
    # Validate first: the database is not touched if the file has errors.
    error_log = validate_mirri_excel(input_fhand, version=spec_version)
    errors = error_log.get_errors()
    skip_first_num = args['skip_first_num']
    if errors:
        write_errors_in_screen(errors, out_fhand)
        sys.exit(1)
    # The validator consumed the file handle; rewind before parsing.
    input_fhand.seek(0)
    parsed_objects = parse_mirri_excel(input_fhand, version=spec_version)
    strains = list(parsed_objects['strains'])
    growth_media = list(parsed_objects['growth_media'])
    server_url = PROD_SERVER_URL if args['use_production_server'] else TEST_SERVER_URL
    client = BiolomicsMirriClient(server_url=server_url, api_version='v2',
                                  client_id=args['client_id'],
                                  client_secret=args['client_secret'],
                                  username=args['user'],
                                  password=args['password'],
                                  verbose=args['verbose'])
    if args['add_gm']:
        client.start_transaction()
        counter = Counter()
        try:
            create_or_upload_growth_media(client, growth_media, update=args['update'],
                                          counter=counter, out_fhand=out_fhand)
        except (Exception, KeyboardInterrupt) as error:
            out_fhand.write('There were some errors in the Growth media upload\n')
            out_fhand.write(str(error) + '\n')
            out_fhand.write('Rolling back\n')
            client.rollback()
            raise
        client.finish_transaction()
        show_stats(counter, 'Growth Media', out_fhand)
    if args['add_strains']:
        client.start_transaction()
        counter = Counter()
        try:
            create_or_upload_strains(client, strains, update=args['update'],
                                     counter=counter,
                                     out_fhand=out_fhand, seek=skip_first_num)
        except (Exception, KeyboardInterrupt) as error:
            out_fhand.write('There were some errors in the Strain upload\n')
            out_fhand.write(str(error) + '\n')
            out_fhand.write('rolling back\n')
            # NOTE(review): unlike the growth-media branch, the rollback call
            # here was commented out in the original, so records created
            # before the failure are left in the database — confirm whether
            # that is intentional.
            # client.rollback()
            raise
        # BUG FIX: finish_transaction() was previously called both inside the
        # try block and here, running twice on success; it is now called once,
        # mirroring the growth-media branch above.
        client.finish_transaction()
        show_stats(counter, 'Strains', out_fhand)
def show_stats(counter, kind, out_fhand):
    """Write an underlined *kind* heading followed by the (up to five) most
    common counts in *counter*."""
    underline = '-' * len(kind)
    out_fhand.write(f'{kind}\n{underline}\n')
    for state, count in counter.most_common(5):
        out_fhand.write(f'{state}: {count}\n')
    out_fhand.write('\n')


if __name__ == '__main__':
    main()

19
bin/validate.py Normal file
View File

@ -0,0 +1,19 @@
#!/usr/bin/env python
import sys
from pathlib import Path
from mirri.validation.excel_validator import validate_mirri_excel
import warnings
warnings.simplefilter("ignore")
def main():
    """Validate the MIRRI Excel file given as the first CLI argument and
    print every validation error found (pk, message and code)."""
    path = Path(sys.argv[1])
    error_log = validate_mirri_excel(path.open("rb"))
    for errors in error_log.get_errors().values():
        for error in errors:
            print(error.pk, error.message, error.code)


if __name__ == "__main__":
    main()

Binary file not shown.

Binary file not shown.

61
mirri/TODO.txt Normal file
View File

@ -0,0 +1,61 @@
Ontobiotope term. just one field in dataset, two fields in biolomics
Altitude. Field and in Coordinates
Geographic origin: field and Entry in other table
Ploidy: How is this field formatted? haploid/diploid or 1, 2, 3...
Best strategy:
My class has
- strain data
- geographic data
- literature
- sequences
No not a valid value for Strain from a registered collection, Allowed values: ?. no. yes
yes not a valid value for GMO, Allowed values: ?. No. Yes
Organism Type:
firstuppercase in deposit
lower case in retrieve
Taxon name is a list in retrieve
null values:
'Comment on taxonomy' = '' could be null
'Coordinates of geographic origin':{Longitude, lati... 'NaN' could be null
'Date of inclusion in the catalogue' = '' could be null
'Enzyme production'= '' could be null
'Ploidy':'?' could be null
Deposit date
--------------------------------------------
- Assign seq to strain in strain serializers
- Fields in ws that are not in our specification. What to do with them
- Type description - IGNORE
- Associated documents - IGNORE
- Data provided by - IGNORE
- Orders - IGNORE
- MTA text - IGNORE
- Catalog URL -
- Publication RecordName assignation. How to do it?
- Sequence RecordName assignation. How to do it
- Publications serializer improvement
------------------------------------------------------
Marker Name. Which options are allowed in WS and how they map to the types in specifications?
update: it should be done in the detail url.
interspecific_hybrid set to "no" by default in web service if no value given.
Tested temperature growth range {'max': 0.0, 'min': 0.0} when added empty
Very slow: A normal search action takes

21
mirri/__init__.py Normal file
View File

@ -0,0 +1,21 @@
import functools
def rgetattr(obj, attr, *args):
    """Recursive getattr: resolve a dotted attribute path (e.g. "a.b.c").

    Extra positional *args* behave like getattr's optional default at
    every step of the path.
    """
    target = obj
    for part in attr.split('.'):
        target = getattr(target, part, *args)
    return target
def rsetattr(obj, attr, val):
    """Recursive setattr: set the attribute named by a dotted path on the
    object resolved by everything before the final dot."""
    parent_path, _, leaf = attr.rpartition('.')
    target = rgetattr(obj, parent_path) if parent_path else obj
    return setattr(target, leaf, val)
# using wonder's beautiful simplification:
# https://stackoverflow.com/questions/31174295/getattr-and-setattr-on-nested-objects/31174427?noredirect=1#comment86638618_31174427
class ValidationError(Exception):
    """Raised when data does not conform to the expected format."""
    pass

View File

View File

View File

@ -0,0 +1,44 @@
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS
from mirri.entities.growth_medium import GrowthMedium
from mirri.biolomics.serializers.growth_media import get_growth_medium_record_name
def get_or_create_or_update_growth_medium(client: BiolomicsMirriClient,
                                          growth_medium: GrowthMedium,
                                          update=False):
    """Fetch the growth medium from the WS, creating it when missing and,
    when *update* is True, pushing an update if the records differ.

    Returns a dict: {'record': ..., 'created': bool, 'updated': bool}.
    """
    found = get_or_create_growth_medium(client, growth_medium)
    remote_gm = found['record']
    if found['created']:
        return {'record': remote_gm, 'created': True, 'updated': False}
    if not update:
        return {'record': remote_gm, 'created': False, 'updated': False}
    # Identity fields never match a freshly parsed record, so they are
    # excluded from the comparison.
    same = growth_medium.is_equal(remote_gm,
                                  exclude_fields=['record_id', 'record_name',
                                                  'acronym'])
    if same:
        return {'record': remote_gm, 'created': False, 'updated': False}
    # Copy the remote identity into the local record before pushing it.
    growth_medium.update(remote_gm, include_fields=['record_id', 'record_name'])
    updated_gm = client.update(GROWTH_MEDIUM_WS, growth_medium)
    return {'record': updated_gm, 'created': False, 'updated': True}
def get_or_create_growth_medium(client: BiolomicsMirriClient,
                                growth_medium: GrowthMedium):
    """Return the WS growth medium record, creating it when it is absent.

    Returns a dict: {'record': ..., 'created': bool}.
    """
    name = get_growth_medium_record_name(growth_medium)
    existing = client.retrieve_by_name(GROWTH_MEDIUM_WS, name)
    if existing is None:
        created = client.create(GROWTH_MEDIUM_WS, growth_medium)
        return {'record': created, 'created': True}
    return {'record': existing, 'created': False}

View File

@ -0,0 +1,122 @@
from pprint import pprint
import deepdiff
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient, BIBLIOGRAPHY_WS, SEQUENCE_WS, STRAIN_WS
from mirri.biolomics.serializers.sequence import GenomicSequenceBiolomics
from mirri.biolomics.serializers.strain import StrainMirri
from mirri.entities.publication import Publication
def retrieve_strain_by_accession_number(client, accession_number):
    """Search the strain WS for an exact accession number match.

    Returns None when nothing matches, the single record when exactly one
    matches, and raises ValueError when the number is ambiguous.
    """
    query = {"Query": [{"Index": 0,
                        "FieldName": "Collection accession number",
                        "Operation": "TextExactMatch",
                        "Value": accession_number}],
             "Expression": "Q0",
             "DisplayStart": 0,
             "DisplayLength": 10}
    result = client.search(STRAIN_WS, query=query)
    hits = result["total"]
    if hits == 0:
        return None
    if hits == 1:
        return result["records"][0]
    raise ValueError(f"More than one entries for {accession_number} in database")
def get_or_create_publication(client: BiolomicsMirriClient, pub: Publication):
    """Return the WS bibliography record for *pub* (looked up by title),
    creating it when it is absent.

    Returns a dict: {'record': ..., 'created': bool}.
    """
    existing = client.retrieve_by_name(BIBLIOGRAPHY_WS, pub.title)
    if existing is None:
        created = client.create(BIBLIOGRAPHY_WS, pub)
        return {'record': created, 'created': True}
    return {'record': existing, 'created': False}
def get_or_create_sequence(client: BiolomicsMirriClient, sequence: GenomicSequenceBiolomics):
    """Return the WS sequence record for *sequence* (looked up by marker id),
    creating it when it is absent.

    Returns a dict: {'record': ..., 'created': bool}.
    """
    existing = client.retrieve_by_name(SEQUENCE_WS, sequence.marker_id)
    if existing is None:
        created = client.create(SEQUENCE_WS, sequence)
        return {'record': created, 'created': True}
    return {'record': existing, 'created': False}
def get_or_create_or_update_strain(client: BiolomicsMirriClient,
                                   record: StrainMirri, update=False):
    """Fetch/create the strain in the WS and, when *update* is True, push
    an update if the local record differs from the remote one.

    Returns a dict: {'record': ..., 'created': bool, 'updated': bool}.
    """
    response = get_or_create_strain(client, record)
    new_record = response['record']
    created = response['created']
    if created:
        return {'record': new_record, 'created': True, 'updated': False}
    if not update:
        return {'record': new_record, 'created': False, 'updated': False}
    # Fill in WS identity fields missing from the local record so the diff
    # below does not report them as changes.
    if record.record_id is None:
        record.record_id = new_record.record_id
    if record.record_name is None:
        record.record_name = new_record.record_name
    if record.synonyms is None or record.synonyms == []:
        record.synonyms = new_record.synonyms
    # Compare local vs remote.  Publication/marker record ids are internal
    # WS references that may legitimately differ, so they are excluded.
    diffs = deepdiff.DeepDiff(new_record.dict(), record.dict(),
                              ignore_order=True, exclude_paths=None,
                              exclude_regex_paths=[
                                  r"root\[\'publications\'\]\[\d+\]\[\'id\'\]",
                                  r"root\[\'publications\'\]\[\d+\]\[\'RecordId\'\]",
                                  r"root\[\'genetics\'\]\[\'Markers\'\]\[\d+\]\[\'RecordId\'\]",
                                  r"root\[\'genetics\'\]\[\'Markers\'\]\[\d+\]\[\'RecordName\'\]"])
    if diffs:
        # Shown for operator inspection of what will be updated.
        pprint(diffs, width=200)
    # Idiom fix: was "True if diffs else False"; dead commented-out debug
    # prints removed.
    records_are_different = bool(diffs)
    if records_are_different:
        updated_record = update_strain(client, record)
        updated = True
    else:
        updated_record = record
        updated = False
    return {'record': updated_record, 'created': False, 'updated': updated}
def get_or_create_strain(client: BiolomicsMirriClient, strain: StrainMirri):
    """Return the WS strain record (looked up by accession number),
    creating it when it is absent.

    Returns a dict: {'record': ..., 'created': bool}.
    """
    existing = retrieve_strain_by_accession_number(client, strain.id.strain_id)
    if existing is None:
        created = create_strain(client, strain)
        return {'record': created, 'created': True}
    return {'record': existing, 'created': False}
def create_strain(client: BiolomicsMirriClient, strain: StrainMirri):
    """Create the strain in the WS, first making sure its publications and
    sequence markers exist so the strain record can reference them.

    Returns the newly created WS strain record.
    """
    # Idiom fix: the per-item results were assigned to an unused local
    # ("creation_response"); only the side effect of ensuring existence
    # matters here.
    for pub in strain.publications:
        get_or_create_publication(client, pub)
    for marker in strain.genetics.markers:
        get_or_create_sequence(client, marker)
    new_strain = client.create(STRAIN_WS, strain)
    return new_strain
def update_strain(client: BiolomicsMirriClient, strain: StrainMirri):
    """Update the strain in the WS, first making sure its publications and
    sequence markers exist so the strain record can reference them.

    Returns the updated WS strain record.
    """
    # Idiom fix: the per-item results were assigned to an unused local
    # ("creation_response"); only the side effect of ensuring existence
    # matters here.
    for pub in strain.publications:
        get_or_create_publication(client, pub)
    for marker in strain.genetics.markers:
        get_or_create_sequence(client, marker)
    new_strain = client.update(STRAIN_WS, strain)
    return new_strain

View File

View File

@ -0,0 +1,210 @@
from mirri.biolomics.remote.endoint_names import (SEQUENCE_WS, STRAIN_WS,
GROWTH_MEDIUM_WS, TAXONOMY_WS,
COUNTRY_WS, ONTOBIOTOPE_WS,
BIBLIOGRAPHY_WS)
from mirri.biolomics.remote.rest_client import BiolomicsClient
from mirri.biolomics.serializers.sequence import (
serialize_to_biolomics as sequence_to_biolomics,
serialize_from_biolomics as sequence_from_biolomics)
from mirri.biolomics.serializers.strain import (
serialize_to_biolomics as strain_to_biolomics,
serialize_from_biolomics as strain_from_biolomics)
from mirri.biolomics.serializers.growth_media import (
serialize_to_biolomics as growth_medium_to_biolomics,
serialize_from_biolomics as growth_medium_from_biolomics)
from mirri.biolomics.serializers.taxonomy import (
serialize_from_biolomics as taxonomy_from_biolomics)
from mirri.biolomics.serializers.locality import (
serialize_from_biolomics as country_from_biolomics)
from mirri.biolomics.serializers.ontobiotope import (
serialize_from_biolomics as ontobiotope_from_biolomics)
from mirri.biolomics.serializers.bibliography import (
serializer_from_biolomics as bibliography_from_biolomics,
serializer_to_biolomics as bibliography_to_biolomics
)
from pprint import pprint
class BiolomicsMirriClient:
    """High level, entity-oriented client for the MIRRI Biolomics WS.

    Wraps the low level ``BiolomicsClient`` and (de)serializes entities
    using the per-entity configuration in ``_conf``.  A lightweight
    client-side "transaction" records the ids of created records so they
    can be deleted again by ``rollback()``.
    """
    # Per-entity configuration: serializer pair ('to' for payloads sent to
    # the WS, 'from' for responses) and the WS endpoint name.  Entities
    # without a 'to' serializer are read-only through this client.
    _conf = {
        SEQUENCE_WS: {
            'serializers': {'to': sequence_to_biolomics,
                            'from': sequence_from_biolomics},
            'endpoint': 'WS Sequences'},
        STRAIN_WS: {
            'serializers': {'to': strain_to_biolomics,
                            'from': strain_from_biolomics},
            'endpoint': 'WS Strains'},
        GROWTH_MEDIUM_WS: {
            'serializers': {'from': growth_medium_from_biolomics,
                            'to': growth_medium_to_biolomics},
            'endpoint': 'WS Growth media'},
        TAXONOMY_WS: {
            'serializers': {'from': taxonomy_from_biolomics},
            'endpoint': 'WS Taxonomy'},
        COUNTRY_WS: {
            'serializers': {'from': country_from_biolomics},
            'endpoint': 'WS Locality'},
        ONTOBIOTOPE_WS: {
            'serializers': {'from': ontobiotope_from_biolomics},
            'endpoint': 'WS Ontobiotope'},
        BIBLIOGRAPHY_WS: {
            'serializers': {'from': bibliography_from_biolomics,
                            'to': bibliography_to_biolomics},
            'endpoint': 'WS Bibliography'
        }
    }

    def __init__(self, server_url, api_version, client_id, client_secret, username,
                 password, website_id=1, verbose=False):
        _client = BiolomicsClient(server_url, api_version, client_id,
                                  client_secret, username, password,
                                  website_id=website_id, verbose=verbose)
        self.client = _client
        self.schemas = self.client.get_schemas()
        self.allowed_fields = self.client.allowed_fields
        # Transaction bookkeeping: list of (endpoint, record_id) tuples of
        # records created while a transaction is open; None when inactive.
        self._transaction_created_ids = None
        self._in_transaction = False
        self._verbose = verbose

    def _initialize_transaction_storage(self):
        if self._in_transaction:
            msg = 'Can not initialize transaction if already in a transaction'
            raise RuntimeError(msg)
        self._transaction_created_ids = []

    def _add_created_to_transaction_storage(self, response, entity_name):
        # Remember a newly created record id so rollback() can delete it.
        if not self._in_transaction:
            msg = 'Can not add ids to transaction storage if not in a transaction'
            raise RuntimeError(msg)
        id_ = response.json().get('RecordId', None)
        if id_ is not None:
            ws_endpoint_name = self._conf[entity_name]['endpoint']
            # Insert at the front so rollback deletes newest-first.
            self._transaction_created_ids.insert(0, (ws_endpoint_name, id_))

    def start_transaction(self):
        """Begin recording created record ids for a possible rollback."""
        self._initialize_transaction_storage()
        self._in_transaction = True

    def finish_transaction(self):
        """Commit: stop recording and forget the created ids."""
        self._in_transaction = False
        self._transaction_created_ids = None

    def get_endpoint(self, entity_name):
        """Return the WS endpoint name configured for *entity_name*."""
        return self._conf[entity_name]['endpoint']

    def get_serializers_to(self, entity_name):
        """Return the entity -> WS payload serializer for *entity_name*."""
        return self._conf[entity_name]['serializers']['to']

    def get_serializers_from(self, entity_name):
        """Return the WS payload -> entity serializer for *entity_name*."""
        return self._conf[entity_name]['serializers']['from']

    def retrieve_by_name(self, entity_name, name):
        """Return the deserialized entity named *name*, or None when the WS
        answers 404 or with an empty body; raises ValueError otherwise."""
        endpoint = self.get_endpoint(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        response = self.client.find_by_name(endpoint, name=name)
        if response.status_code == 404:
            return None
        elif response.status_code != 200:
            raise ValueError(f"{response.status_code}: {response.text}")
        ws_entity = response.json()
        return None if ws_entity is None else serializer_from(ws_entity,
                                                              client=self)

    def retrieve_by_id(self, entity_name, _id):
        """Return the deserialized entity with record id *_id*, or None on
        404; raises ValueError on any other non-200 status."""
        endpoint = self.get_endpoint(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        response = self.client.retrieve(endpoint, record_id=_id)
        if response.status_code == 404:
            return None
        elif response.status_code != 200:
            raise ValueError(f"{response.status_code}: {response.text}")
        ws_entity = response.json()
        return serializer_from(ws_entity, client=self)

    def create(self, entity_name, entity):
        """Serialize and create *entity* in the WS; returns the deserialized
        response.  Created ids are tracked when a transaction is open."""
        endpoint = self.get_endpoint(entity_name)
        serializer_to = self.get_serializers_to(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        data = serializer_to(entity, client=self)
        response = self.client.create(endpoint, data=data)
        if response.status_code == 200:
            if self._in_transaction:
                self._add_created_to_transaction_storage(response, entity_name)
            return serializer_from(response.json(), client=self)
        else:
            msg = f"return_code: {response.status_code}. msg: {response.json()['errors']['Value']}"
            raise RuntimeError(msg)

    def delete_by_id(self, entity_name, record_id):
        """Delete the record with *record_id*; raises RuntimeError on any
        non-200 status."""
        endpoint = self.get_endpoint(entity_name)
        response = self.client.delete(endpoint, record_id=record_id)
        if response.status_code != 200:
            error = response.json()
            raise RuntimeError(error)

    def delete_by_name(self, entity_name, record_name):
        """Look up the record id for *record_name* and delete that record.

        Raises ValueError when the name does not exist and RuntimeError
        when the lookup itself fails.
        """
        endpoint = self.get_endpoint(entity_name)
        response = self.client.find_by_name(endpoint, record_name)
        if response.status_code != 200:
            error = response.json()
            raise RuntimeError(error)
        try:
            record_id = response.json()['RecordId']
        except TypeError:
            # A 200 with a non-dict body means the name was not found.
            raise ValueError(f'The given record_name {record_name} does not exists')
        self.delete_by_id(entity_name, record_id=record_id)

    def search(self, entity_name, query):
        """Run a WS search *query* and return
        {'total': int, 'records': [deserialized entities]}."""
        endpoint = self.get_endpoint(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        response = self.client.search(endpoint, search_query=query)
        if response.status_code != 200:
            error = response.json()
            raise RuntimeError(error)
        search_result = response.json()
        result = {'total': search_result['TotalCount'],
                  'records': [serializer_from(record, client=self)
                              for record in search_result['Records']]}
        return result

    def update(self, entity_name, entity):
        """Serialize and update *entity* (which must carry a record_id) in
        the WS; returns the deserialized response."""
        record_id = entity.record_id
        if record_id is None:
            msg = 'In order to update the record, you need the recordId in the entity'
            raise ValueError(msg)
        endpoint = self.get_endpoint(entity_name)
        serializer_to = self.get_serializers_to(entity_name)
        serializer_from = self.get_serializers_from(entity_name)
        data = serializer_to(entity, client=self, update=True)
        response = self.client.update(endpoint, record_id=record_id, data=data)
        if response.status_code == 200:
            entity = serializer_from(response.json(), client=self)
            return entity
        else:
            msg = f"return_code: {response.status_code}. msg: {response.text}"
            raise RuntimeError(msg)

    def rollback(self):
        """Abort the open transaction, deleting every record created in it.

        NOTE(review): delegates to BiolomicsClient.rollback, which is not
        visible in this file — confirm it exists and deletes the given
        (endpoint, id) pairs.
        """
        self._in_transaction = False
        self.client.rollback(self._transaction_created_ids)
        self._transaction_created_ids = None

View File

@ -0,0 +1,7 @@
# Symbolic names for the Biolomics web service entity types.  They are used
# as keys into the client configuration (serializers and endpoint paths).
SEQUENCE_WS = 'sequence'
STRAIN_WS = 'strain'
GROWTH_MEDIUM_WS = 'growth_medium'
TAXONOMY_WS = 'taxonomy'
COUNTRY_WS = 'country'
ONTOBIOTOPE_WS = 'ontobiotope'
BIBLIOGRAPHY_WS = 'bibliography'

View File

@ -0,0 +1,214 @@
import time
import re
import sys
import requests
from requests_oauthlib import OAuth2Session
from oauthlib.oauth2 import LegacyApplicationClient
from oauthlib.oauth2.rfc6749.errors import InvalidGrantError
from mirri.entities.strain import ValidationError
class BiolomicsClient:
    """Low level REST client for the Biolomics web service.

    Handles OAuth2 password-grant authentication, URL building and the raw
    HTTP verbs; methods return the raw ``requests.Response`` objects.
    """
    # Cached at class level so every instance shares one schema download.
    schemas = None
    allowed_fields = None

    def __init__(self, server_url, api_version, client_id, client_secret,
                 username, password, website_id=1, verbose=False):
        self._client_id = client_id
        self._client_secret = client_secret
        self._username = username
        self._password = password
        self._client = None
        self.server_url = server_url
        self._api_version = api_version
        self._auth_url = self.server_url + "/connect/token"
        self.access_token = None
        self.website_id = website_id
        self._verbose = verbose
        # Fetch the schemas eagerly; this also authenticates, so bad
        # credentials fail fast at construction time.
        self._schema = self.get_schemas()
    def get_access_token(self):
        """Return a valid OAuth2 access token, fetching a fresh one when no
        token is cached or the cached one has expired."""
        if self._client is None:
            self._client = LegacyApplicationClient(client_id=self._client_id)
            authenticated = False
        else:
            expires_at = self._client.token["expires_at"]
            authenticated = expires_at > time.time()
        if not authenticated:
            oauth = OAuth2Session(client=self._client)
            try:
                token = oauth.fetch_token(
                    token_url=self._auth_url,
                    username=self._username,
                    password=self._password,
                    client_id=self._client_id,
                    client_secret=self._client_secret,
                )
            except InvalidGrantError:
                # Close the session before propagating bad-credential errors.
                oauth.close()
                raise
            self.access_token = token["access_token"]
            oauth.close()
        return self.access_token

    def _build_headers(self):
        """Build the common request headers, refreshing the token first."""
        self.get_access_token()
        return {
            "accept": "application/json",
            "websiteId": str(self.website_id),
            "Authorization": f"Bearer {self.access_token}",
        }
    def get_detail_url(self, end_point, record_id, api_version=None):
        """URL of a single record; the API version segment is included only
        when *api_version* is given."""
        if api_version:
            return "/".join([self.server_url, api_version, 'data',
                             end_point, str(record_id)])
        else:
            return "/".join([self.server_url, 'data', end_point, str(record_id)])

    def get_list_url(self, end_point):
        """URL of an endpoint's record collection (no API version segment)."""
        return "/".join([self.server_url, 'data', end_point])

    def get_search_url(self, end_point):
        """URL of the versioned search endpoint."""
        return "/".join([self.server_url, self._api_version, 'search', end_point])

    def get_find_by_name_url(self, end_point):
        """URL of the versioned search-by-name endpoint."""
        return "/".join([self.get_search_url(end_point), 'findByName'])
    def search(self, end_point, search_query):
        """POST *search_query* to the search endpoint; returns the raw
        response.  Request timing is printed when verbose."""
        self._check_end_point_exists(end_point)
        header = self._build_headers()
        url = self.get_search_url(end_point)
        time0 = time.time()
        response = requests.post(url, json=search_query, headers=header)
        time1 = time.time()
        if self._verbose:
            sys.stdout.write(f'Search to {end_point} request time for {url}: {time1 - time0}\n')
        return response

    def retrieve(self, end_point, record_id):
        """GET a single record by id; returns the raw response."""
        self._check_end_point_exists(end_point)
        header = self._build_headers()
        url = self.get_detail_url(end_point, record_id, api_version=self._api_version)
        time0 = time.time()
        response = requests.get(url, headers=header)
        time1 = time.time()
        if self._verbose:
            sys.stdout.write(f'Get to {end_point} request time for {url}: {time1-time0}\n')
        return response

    def create(self, end_point, data):
        """POST *data* to create a record, validating it against the schema
        first; returns the raw response."""
        self._check_end_point_exists(end_point)
        self._check_data_consistency(data, self.allowed_fields[end_point])
        header = self._build_headers()
        url = self.get_list_url(end_point)
        return requests.post(url, json=data, headers=header)

    def update(self, end_point, record_id, data):
        """PUT *data* to update an existing record, validating it against
        the schema first; returns the raw response."""
        self._check_end_point_exists(end_point)
        self._check_data_consistency(data, self.allowed_fields[end_point],
                                     update=True)
        header = self._build_headers()
        url = self.get_detail_url(end_point, record_id=record_id)
        return requests.put(url, json=data, headers=header)

    def delete(self, end_point, record_id):
        """DELETE the record with *record_id*; returns the raw response."""
        self._check_end_point_exists(end_point)
        header = self._build_headers()
        url = self.get_detail_url(end_point, record_id)
        return requests.delete(url, headers=header)

    def find_by_name(self, end_point, name):
        """GET the record whose name is *name*; returns the raw response."""
        self._check_end_point_exists(end_point)
        header = self._build_headers()
        url = self.get_find_by_name_url(end_point)
        response = requests.get(url, headers=header, params={'name': name})
        return response
def get_schemas(self):
if self.schemas is None:
headers = self._build_headers()
url = self.server_url + '/schemas'
response = requests.get(url, headers=headers)
if response.status_code == 200:
self.schemas = response.json()
else:
raise ValueError(f"{response.status_code}: {response.text}")
if self.allowed_fields is None:
self.allowed_fields = self._process_schema(self.schemas)
return self.schemas
@staticmethod
def _process_schema(schemas):
    """Index the first schema's table views: view name -> {field title: field}."""
    first_schema = schemas[0]
    return {view['TableViewName']: {field['title']: field
                                    for field in view['ResultFields']}
            for view in first_schema['TableViews']}
def _check_end_point_exists(self, endpoint):
    """Raise ValueError when *endpoint* is not one of the known endpoints."""
    # membership test directly on the dict; `.keys()` was redundant
    if endpoint not in self.allowed_fields:
        raise ValueError(f'{endpoint} not a recognised endpoint')
def _check_data_consistency(self, data, allowed_fields, update=False):
    """Validate payload keys and every RecordDetails field against the schema."""
    if update:
        # an update must carry all three identifying keys
        mandatory = {'RecordDetails', 'RecordName', 'RecordId'}
        if not mandatory.issubset(data.keys()):
            msg = 'Updating data keys must be RecordDetails, RecordName and RecordId'
            raise ValidationError(msg)
    else:
        # a create may only carry these keys
        unexpected = set(data.keys()) - {'RecordDetails', 'RecordName', 'Acronym'}
        if unexpected:
            msg = 'data keys must be RecordDetails and RecordName or Acronym'
            raise ValidationError(msg)
    for field_name, field_value in data['RecordDetails'].items():
        if field_name not in allowed_fields:
            raise ValidationError(f'{field_name} not in allowed fields')
        self._check_field_schema(field_name, allowed_fields[field_name],
                                 field_value)
@staticmethod
def _check_field_schema(field_name, field_schema, field_value):
    """Check one field's value against its schema entry.

    Verifies the FieldType matches and, when the schema restricts the field
    to a closed list of states (optionally per subfield), that the supplied
    value(s) belong to that list.  Raises ValidationError otherwise.
    """
    if field_schema['FieldType'] != field_value['FieldType']:
        msg = f"Bad FieldType ({field_value['FieldType']}) for {field_name}. "
        msg += f"It should be {field_schema['FieldType']}"
        raise ValidationError(msg)
    states = field_schema.get('states')
    if states:
        # strip parenthesised qualifiers, e.g. "Yes (verified)" -> "Yes"
        states = [re.sub(r" *\(.*\)", "", s) for s in states]
    subfields = field_schema.get('subfields')
    if subfields is not None and states is not None:
        subfield_names = [subfield['SubFieldName']
                          for subfield in subfields if subfield['IsUsed']]
        for val in field_value['Value']:
            if val['Name'] not in subfield_names:
                msg = f"{field_name}: {val['Name']} not in {subfield_names}"
                raise ValidationError(msg)
            if val['Value'] not in states:
                # BUG FIX: report the offending subfield value (val['Value']),
                # not the whole field value list
                msg = f"{val['Value']} not a valid value for "
                msg += f"{field_name}, Allowed values: {'. '.join(states)}"
                raise ValidationError(msg)
    elif states is not None:
        if field_value['Value'] not in states:
            msg = f"{field_value['Value']} not a valid value for "
            msg += f"{field_name}, Allowed values: {'. '.join(states)}"
            raise ValidationError(msg)
def rollback(self, created_ids):
    """Best-effort deletion of records created during a failed batch upload."""
    for endpoint, record_id in created_ids:
        try:
            self.delete(end_point=endpoint, record_id=record_id)
        except Exception:
            # deliberately swallowed: one failed delete must not stop
            # the rest of the rollback
            pass

View File

@ -0,0 +1,3 @@
# Top-level keys used by every Biolomics web-service JSON payload.
RECORD_ID = 'RecordId'
RECORD_NAME = 'RecordName'
RECORD_DETAILS = 'RecordDetails'

View File

@ -0,0 +1,82 @@
from typing import List
from mirri import rgetattr
from mirri.entities.publication import Publication
from mirri.biolomics.settings import PUB_MIRRI_FIELDS
RECORD_ID = 'RecordId'
RECORD_NAME = 'RecordName'
# Publication attribute name -> Biolomics bibliography field label.
# Commented-out entries exist in the web service but are not handled yet.
PUB_MAPPING = {
    # 'record_id': 'RecordId',
    # 'record_name': 'RecordName',
    'strains': "Associated strains",
    'taxa': "Associated taxa",
    'authors': "Authors",
    # 'sequneces': "Associated sequences",
    # 'abstract': "Abstract",
    # 'collection': "Collection",
    'doi': "DOI number",
    'editor': "Editor(s)",
    # 'full_reference': "Full reference",
    # 'link': "Hyperlink",
    'isbn': "ISBN",
    'issn': "ISSN",
    'issue': "Issue",
    'journal': "Journal",
    'journal_book': "Journal-Book",
    # 'keywords': "Keywords",
    'first_page': "Page from",
    'last_page': "Page to",
    'publisher': "Publisher",
    'pubmed_id': "PubMed ID",
    'volume': "Volume",
    'year': "Year",
}
# Reverse lookup: Biolomics field label -> Publication attribute name.
REV_PUB_MAPPING = {v: k for k, v in PUB_MAPPING.items()}
def serializer_from_biolomics(ws_data, client=None) -> Publication:
    """Build a Publication from a Biolomics bibliography record payload."""
    publication = Publication()
    publication.record_id = ws_data[RECORD_ID]
    publication.record_name = ws_data[RECORD_NAME]
    # the record name doubles as the publication title
    publication.title = ws_data[RECORD_NAME]
    for field_label, content in ws_data['RecordDetails'].items():
        field_value = content['Value']
        attr = REV_PUB_MAPPING.get(field_label)
        if not field_value or attr is None:
            continue
        if attr in ('year', 'first_page', 'last_page'):
            field_value = int(field_value)
        setattr(publication, attr, field_value)
    return publication
def get_publication_record_name(publication):
    """Pick a record name: record_name, title, PubMed id or DOI, in that order.

    Returns None when none of the candidates is set.
    """
    for candidate in (publication.record_name, publication.title):
        if candidate:
            return candidate
    if publication.pubmed_id:
        return f'PUBMED:{publication.pubmed_id}'
    if publication.doi:
        return f'DOI:{publication.doi}'
def serializer_to_biolomics(publication: Publication, client=None, update=False):
    """Serialize a Publication into the Biolomics JSON structure."""
    ws_data = {}
    if publication.record_id:
        ws_data[RECORD_ID] = publication.record_id
    ws_data[RECORD_NAME] = get_publication_record_name(publication)
    details = {}
    for attr, ws_field in PUB_MAPPING.items():
        attr_value = getattr(publication, attr, None)
        if attr_value is None:
            continue
        # 'year' is a date-like field in the web service; the rest are text
        kind = 'D' if attr == 'year' else "E"
        details[ws_field] = {'Value': attr_value, 'FieldType': kind}
    ws_data['RecordDetails'] = details
    return ws_data

View File

@ -0,0 +1,66 @@
from mirri.biolomics.serializers import RECORD_ID, RECORD_NAME, RECORD_DETAILS
from mirri.entities.growth_medium import GrowthMedium
def serialize_from_biolomics(ws_data, client=None) -> GrowthMedium:
    """Build a GrowthMedium from a Biolomics growth-medium record payload."""
    medium = GrowthMedium()
    medium.record_name = ws_data.get('RecordName', None)
    # default description falls back to the record name (or acronym)
    medium.description = get_growth_medium_record_name(medium)
    medium.record_id = ws_data.get('RecordId', None)
    attr_by_field = {
        "Full description": 'full_description',
        "Ingredients": 'ingredients',
        'Medium description': 'description',
        'Other name': 'other_name',
        'pH': 'ph',
        'Sterilization conditions': 'sterilization_conditions',
    }
    for field_label, content in ws_data['RecordDetails'].items():
        field_value = content['Value']
        if not field_value:
            continue
        attr = attr_by_field.get(field_label)
        if attr is not None:
            setattr(medium, attr, field_value)
    return medium
def get_growth_medium_record_name(growth_medium):
    """Pick a record name: record_name, description or acronym; first set wins.

    Returns None when none of them is set.
    """
    for candidate in (growth_medium.record_name,
                      growth_medium.description,
                      growth_medium.acronym):
        if candidate:
            return candidate
# GrowthMedium attribute name -> Biolomics growth-medium field label.
GROWTH_MEDIUM_MAPPING = {
    'acronym': 'Acronym',
    'full_description': "Full description",
    'ingredients': "Ingredients",
    'description': 'Medium description',
    'other_name': 'Other name',
    'ph': 'pH',
    'sterilization_conditions': 'Sterilization conditions'
}
def serialize_to_biolomics(growth_medium: GrowthMedium, client=None, update=False):
    """Serialize a GrowthMedium into the Biolomics JSON structure."""
    ws_data = {}
    if growth_medium.record_id:
        ws_data[RECORD_ID] = growth_medium.record_id
    ws_data[RECORD_NAME] = get_growth_medium_record_name(growth_medium)
    details = {}
    for attr in growth_medium.fields:
        # identity-like fields are carried at the top level, not as details
        if attr in ('acronym', 'record_id', 'record_name'):
            continue
        attr_value = getattr(growth_medium, attr, None)
        if attr_value is not None:
            details[GROWTH_MEDIUM_MAPPING[attr]] = {'Value': attr_value,
                                                    'FieldType': 'E'}
    ws_data[RECORD_DETAILS] = details
    return ws_data

View File

@ -0,0 +1,26 @@
from mirri.entities.location import Location
def serialize_from_biolomics(ws_data, client=None):
    # Locations are kept as the raw web-service payload for now.
    return ws_data
# this is a proof of concept
def serialize_location(location: 'Location'):
    """Serialize a Location into a Biolomics record structure.

    Only country and GIS position are mapped; the strain RLink is left as
    an empty placeholder to be filled in later.
    """
    fields = {}
    if location.country:
        fields['Country'] = {'Value': location.country, 'FieldType': 'E'}
    # BUG FIX: explicit None checks so 0.0 (equator / Greenwich meridian)
    # counts as a valid coordinate; truthiness dropped it before
    if location.latitude is not None and location.longitude is not None:
        value = {'Latitude': location.latitude,
                 'Longitude': location.longitude}
        if location.coord_uncertainty:
            value['Precision'] = location.coord_uncertainty
        fields['GIS position'] = {'FieldType': 'L', 'Value': value}
    fields['Strains'] = {"FieldType": "RLink", 'Value': [{
        'Name': {'Value': None, 'FieldType': "E"},
        'RecordId': None
    }]}
    return {"RecordDetails": fields,
            "RecordName": location.country}

View File

@ -0,0 +1,2 @@
def serialize_from_biolomics(ws_data, client=None):
    # Pass-through: this entity is kept as the raw web-service payload.
    return ws_data

View File

@ -0,0 +1,81 @@
from mirri.entities.sequence import GenomicSequence
from mirri.biolomics.serializers import RECORD_ID, RECORD_NAME, RECORD_DETAILS
class GenomicSequenceBiolomics(GenomicSequence):
    """GenomicSequence extended with Biolomics record id/name bookkeeping."""
    def __init__(self, **kwargs):
        # freeze=False so the Biolomics-only keys can be added to _data
        super().__init__(freeze=False, **kwargs)
    @property
    def record_id(self) -> int:
        """Biolomics record id, or None when not stored remotely yet."""
        return self._data.get(RECORD_ID, None)
    @record_id.setter
    def record_id(self, value: int):
        self._data[RECORD_ID] = value
    @property
    def record_name(self) -> str:
        """Biolomics record name, or None when not stored remotely yet."""
        return self._data.get(RECORD_NAME, None)
    @record_name.setter
    def record_name(self, value: str):
        self._data[RECORD_NAME] = value
    def dict(self):
        """Return the parent dict augmented with record id/name when set."""
        _data = super(GenomicSequenceBiolomics, self).dict()
        if self.record_id:
            _data[RECORD_ID] = self.record_id
        if self.record_name:
            _data[RECORD_NAME] = self.record_name
        return _data
def serialize_to_biolomics(marker: GenomicSequenceBiolomics, client=None, update=False):
    """Serialize a genomic sequence marker into the Biolomics JSON structure."""
    # BUG FIX: removed a stray debug print() that wrote a blank line to stdout
    ws_sequence = {}
    if marker.record_id:
        ws_sequence[RECORD_ID] = marker.record_id
    # fall back to the INSDC accession when no record name is known yet
    if marker.record_name:
        ws_sequence[RECORD_NAME] = marker.record_name
    else:
        ws_sequence[RECORD_NAME] = marker.marker_id
    details = {}
    if marker.marker_id:
        details["INSDC number"] = {"Value": marker.marker_id,
                                   "FieldType": "E"}
    if marker.marker_seq:
        details["DNA sequence"] = {
            "Value": {"Sequence": marker.marker_seq},
            "FieldType": "N"}
    if marker.marker_type:
        details['Marker name'] = {"Value": marker.marker_type, "FieldType": "E"}
    ws_sequence[RECORD_DETAILS] = details
    return ws_sequence
# Web-service marker names -> spec marker type codes.
MAPPING_WS_SPEC_TYPES = {
    'Beta tubulin': 'TUBB'
}
def serialize_from_biolomics(ws_data, client=None) -> GenomicSequenceBiolomics:
    """Build a GenomicSequenceBiolomics from a Biolomics sequence record."""
    marker = GenomicSequenceBiolomics()
    marker.record_id = ws_data[RECORD_ID]
    marker.record_name = ws_data[RECORD_NAME]
    for field_label, content in ws_data['RecordDetails'].items():
        field_value = content['Value']
        if field_label == 'INSDC number' and field_value:
            marker.marker_id = field_value
        elif field_label == 'Marker name' and field_value:
            # translate web-service marker names to spec codes when known
            marker.marker_type = MAPPING_WS_SPEC_TYPES.get(field_value,
                                                           field_value)
        elif (field_label == 'DNA sequence' and 'Sequence' in field_value
                and field_value['Sequence']):
            marker.marker_seq = field_value['Sequence']
    return marker

View File

@ -0,0 +1,462 @@
import re
import sys
import pycountry
from mirri import rgetattr, rsetattr
from mirri.entities.date_range import DateRange
from mirri.entities.strain import ORG_TYPES, OrganismType, StrainId, StrainMirri, add_taxon_to_strain
from mirri.biolomics.remote.endoint_names import (GROWTH_MEDIUM_WS, TAXONOMY_WS,
ONTOBIOTOPE_WS, BIBLIOGRAPHY_WS, SEQUENCE_WS, COUNTRY_WS)
from mirri.settings import (
ALLOWED_FORMS_OF_SUPPLY,
NAGOYA_PROBABLY_SCOPE,
NAGOYA_NO_RESTRICTIONS,
NAGOYA_DOCS_AVAILABLE,
NO_RESTRICTION,
ONLY_RESEARCH,
COMMERCIAL_USE_WITH_AGREEMENT,
)
from mirri.biolomics.settings import MIRRI_FIELDS
from mirri.utils import get_pycountry
# Spec Nagoya-restriction codes -> Biolomics state names (and reverse).
NAGOYA_TRANSLATOR = {
    NAGOYA_NO_RESTRICTIONS: "no known restrictions under the Nagoya protocol",
    NAGOYA_DOCS_AVAILABLE: "documents providing proof of legal access and terms of use available at the collection",
    NAGOYA_PROBABLY_SCOPE: "strain probably in scope, please contact the culture collection",
}
REV_NAGOYA_TRANSLATOR = {v: k for k, v in NAGOYA_TRANSLATOR.items()}
# Spec restriction-on-use codes -> Biolomics state names (and reverse).
RESTRICTION_USE_TRANSLATOR = {
    NO_RESTRICTION: "no restriction apply",
    ONLY_RESEARCH: "for research use only",
    COMMERCIAL_USE_WITH_AGREEMENT: "for commercial development a special agreement is requested",
}
REV_RESTRICTION_USE_TRANSLATOR = {v: k for k,
                                  v in RESTRICTION_USE_TRANSLATOR.items()}
# Field labels that get special value handling during (de)serialization.
DATE_TYPE_FIELDS = ("Date of collection", "Date of isolation",
                    "Date of deposit", "Date of inclusion in the catalogue")
BOOLEAN_TYPE_FIELDS = ("Strain from a registered collection", "Dual use",
                       "Quarantine in Europe", "Interspecific hybrid")  # , 'GMO')
FILE_TYPE_FIELDS = ("MTA file", "ABS related files")
MAX_MIN_TYPE_FIELDS = ("Tested temperature growth range",
                       "Recommended growth temperature")
# Labels whose list values are stored remotely as a '; '-joined string.
LIST_TYPES_TO_JOIN = ('Other denomination', 'Plasmids collections fields', 'Plasmids')
# Spec marker type -> Biolomics strain field holding that marker's sequences.
MARKER_TYPE_MAPPING = {
    '16S rRNA': 'Sequences 16s',  # or Sequences c16S rRNA
    'ACT': 'Sequences ACT',
    'CaM': 'Sequences CaM',
    'EF-1α': 'Sequences TEF1a',
    'ITS': 'Sequences ITS',
    'LSU': 'Sequences LSU',
    'RPB1': 'Sequences RPB1',
    'RPB2': 'Sequences RPB2',
    'TUBB': 'Sequences TUB'  # or Sequences Beta tubulin
}
def serialize_to_biolomics(strain: StrainMirri, client=None, update=False,
                           log_fhand=None):  # sourcery no-metrics
    """Serialize a StrainMirri into the Biolomics strain JSON structure.

    Walks MIRRI_FIELDS, translating each spec attribute into its Biolomics
    field; entries without a "biolomics" config are skipped.  When *client*
    is given, cross-referenced entities (taxa, growth media, countries,
    ontobiotopes, publications, markers) are resolved to RLinks against the
    live web service; otherwise those fields are skipped.  Warnings about
    unresolved references are written to *log_fhand* (defaults to stdout).
    """
    if log_fhand is None:
        log_fhand = sys.stdout
    strain_record_details = {}
    for field in MIRRI_FIELDS:
        try:
            biolomics_field = field["biolomics"]["field"]
            biolomics_type = field["biolomics"]["type"]
        except KeyError:
            # print(f'biolomics not configured: {field["label"]}')
            continue
        label = field["label"]
        attribute = field["attribute"]
        value = rgetattr(strain, attribute, None)
        if value is None:
            continue
        if label == "Accession number":
            value = f"{strain.id.collection} {strain.id.number}"
        if label == "Restrictions on use":
            value = RESTRICTION_USE_TRANSLATOR[value]
        elif label == "Nagoya protocol restrictions and compliance conditions":
            value = NAGOYA_TRANSLATOR[value]
        elif label in FILE_TYPE_FIELDS:
            value = [{"Name": "link", "Value": fname} for fname in value]
        elif label == "Other culture collection numbers":
            value = "; ".join(on.strain_id for on in value) if value else None
        elif label in BOOLEAN_TYPE_FIELDS:
            value = 'yes' if value else 'no'
        # NOTE(review): `label in 'GMO'` is a substring test, not equality;
        # it only behaves like `label == 'GMO'` because no other label is a
        # substring of 'GMO'.  Probably meant `==` -- confirm.
        elif label in 'GMO':
            value = 'Yes' if value else 'No'
        elif label == "Organism type":
            # one yes/no entry per known organism type
            org_types = [ot.name for ot in value]
            value = []
            for ot in ORG_TYPES.keys():
                is_organism = "yes" if ot in org_types else "no"
                value.append({"Name": ot, "Value": is_organism})
        elif label == 'Taxon name':
            if client:
                # resolve every rank of the long name to a remote taxon RLink
                taxa = strain.taxonomy.long_name.split(';')
                value = []
                for taxon_name in taxa:
                    taxon = get_remote_rlink(client, TAXONOMY_WS,
                                             taxon_name)
                    if taxon:
                        value.append(taxon)
                if not value:
                    msg = f'WARNING: {strain.taxonomy.long_name} not found in database'
                    log_fhand.write(msg + '\n')
                    # TODO: decide to raise or not if taxon not in MIRRI DB
                    #raise ValueError(msg)
        elif label in DATE_TYPE_FIELDS:
            # NOTE(review): reaches into DateRange privates; missing month or
            # day default to 1 (January / the 1st)
            year = value._year
            month = value._month or 1
            day = value._day or 1
            if year is None:
                continue
            value = f"{year}-{month:02}-{day:02}"
        elif label == 'History of deposit':
            value = " < ".join(value)
        elif label in MAX_MIN_TYPE_FIELDS:
            # a scalar means max == min; otherwise a {'max': .., 'min': ..} dict
            if isinstance(value, (int, float, str)):
                _max, _min = float(value), float(value)
            else:
                _max, _min = float(value['max']), float(value['min'])
            content = {"MaxValue": _max, "MinValue": _min,
                       "FieldType": biolomics_type}
            strain_record_details[biolomics_field] = content
            continue
        elif label in LIST_TYPES_TO_JOIN:
            value = '; '.join(value)
        # TODO: Check how to deal with crossrefs
        elif label == "Recommended medium for growth":
            if client is not None:
                ref_value = []
                for medium in value:
                    ws_gm = client.retrieve_by_name(GROWTH_MEDIUM_WS, medium)
                    if ws_gm is None:
                        raise ValueError(
                            f'Can not find the growth medium: {medium}')
                    gm = {"Name": {"Value": medium, "FieldType": "E"},
                          "RecordId": ws_gm.record_id}
                    ref_value.append(gm)
                value = ref_value
            else:
                continue
        elif label == "Form of supply":
            # one yes/no entry per allowed form of supply
            _value = []
            for form in ALLOWED_FORMS_OF_SUPPLY:
                is_form = "yes" if form in value else "no"
                _value.append({"Name": form, "Value": is_form})
            value = _value
            # print(label, value), biolomics_field
        elif label == "Coordinates of geographic origin":
            value = {'Latitude': strain.collect.location.latitude,
                     'Longitude': strain.collect.location.longitude}
            precision = strain.collect.location.coord_uncertainty
            if precision is not None:
                value['Precision'] = precision
        elif label == "Geographic origin":
            # country goes to its own RLink field; the rest of the location
            # (state/municipality/site) is joined into the text field
            if client is not None and value.country is not None:
                country = get_pycountry(value.country)
                if country is None:
                    log_fhand.write(f'WARNING: {value.country} Not a valida country code/name\n')
                else:
                    _value = get_country_record(country, client)
                    if _value is None:  # TODO: Remove this once the countries are added to the DB
                        msg = f'WARNING: {value.country} not in MIRRI DB'
                        log_fhand.write(msg + '\n')
                        #raise ValueError(msg)
                    else:
                        content = {"Value": [_value], "FieldType": "RLink"}
                        strain_record_details['Country'] = content
            _value = []
            for sector in ('state', 'municipality', 'site'):
                sector_val = getattr(value, sector, None)
                if sector_val:
                    _value.append(sector_val)
            value = "; ".join(_value) if _value else None
            if value is None:
                continue
        elif label == "Ontobiotope":
            if client and value:
                onto = get_remote_rlink(client, ONTOBIOTOPE_WS, value)
                value = [onto] if onto is not None else None
        elif label == 'Literature':
            if client and value:
                pub_rlinks = []
                for pub in value:
                    rlink = get_remote_rlink(client, BIBLIOGRAPHY_WS, pub.title)
                    if rlink:
                        pub_rlinks.append(rlink)
                if pub_rlinks:
                    value = pub_rlinks
                else:
                    continue
        # NOTE(review): dead branch, label is never the empty string
        elif label == '':
            pass
        elif label == 'Ploidy':
            value = _translate_polidy(value)
        if value is not None:
            content = {"Value": value, "FieldType": biolomics_type}
            strain_record_details[biolomics_field] = content
    # if False:
    #     record_details["Data provided by"] = {
    #         "Value": strain.id.collection, "FieldType": "V"}
    # Markers
    if client:
        add_markers_to_strain_details(client, strain, strain_record_details)
    strain_structure = {"RecordDetails": strain_record_details}
    if update:
        strain_structure['RecordId'] = strain.record_id
        strain_structure['RecordName'] = strain.record_name
    else:
        strain_structure["Acronym"] = "MIRRI"
    return strain_structure
def add_markers_to_strain_details(client, strain: StrainMirri, details):
    """Resolve each strain marker remotely and add it as an NLink to *details*.

    Markers that cannot be found in the web service are skipped with a
    message on stdout.  Mutates *details* in place.
    """
    for marker in strain.genetics.markers:
        marker_name = marker.marker_id
        marker_in_ws = client.retrieve_by_name(SEQUENCE_WS, marker_name)
        if marker_in_ws is None:
            # NOTE(review): debug-style print; consider the log_fhand used
            # by serialize_to_biolomics instead
            print('Marker not in web service')
            continue
        marker_type = marker.marker_type
        ws_marker = {
            "Value": [{
                "Name": {"Value": marker_in_ws.record_name,
                         "FieldType": "E"},
                "RecordId": marker_in_ws.record_id
            }],
            "FieldType": "NLink"
        }
        # attach the sequence itself when the remote record carries one
        if marker_in_ws.marker_seq:
            ws_marker['Value'][0]["TargetFieldValue"] = {
                "Value": {"Sequence": marker_in_ws.marker_seq},
                "FieldType": "N"
            }
        details[MARKER_TYPE_MAPPING[marker_type]] = ws_marker
def get_remote_rlink(client, endpoint, record_name):
    """Look up *record_name* at *endpoint* and return an RLink dict, or None."""
    entity = client.retrieve_by_name(endpoint, record_name)
    if not entity:
        return None
    # some Endpoints does not serialize the json into a python object yet
    try:
        name, rec_id = entity.record_name, entity.record_id
    except AttributeError:
        name, rec_id = entity["RecordName"], entity["RecordId"]
    return {"Name": {"Value": name, "FieldType": "E"},
            "RecordId": rec_id}
def add_strain_rlink_to_entity(record, strain_id, strain_name):
    """Attach an RLink pointing at a strain to *record*'s details; return it."""
    rlink_value = [{'Name': {'Value': strain_name, 'FieldType': "E"},
                    'RecordId': strain_id}]
    record['RecordDetails']['Strains'] = {"FieldType": "RLink",
                                          'Value': rlink_value}
    return record
# Numeric spec ploidy codes -> Biolomics state names (and reverse).
PLOIDY_TRANSLATOR = {
    0: 'Aneuploid',
    1: 'Haploid',
    2: 'Diploid',
    3: 'Triploid',
    4: 'Tetraploid',
    9: 'Polyploid'
}
REV_PLOIDY_TRANSLATOR = {v: k for k, v in PLOIDY_TRANSLATOR.items()}
def _translate_polidy(ploidy):
    """Translate a numeric ploidy code into its Biolomics state name.

    Returns '?' when the value is missing or not numeric, and 'Polyploid'
    for any numeric code without an explicit mapping.
    """
    try:
        ploidy = int(ploidy)
    except (TypeError, ValueError):
        # BUG FIX: int('not-a-number') raises ValueError, which previously
        # escaped because only TypeError was caught
        return '?'
    return PLOIDY_TRANSLATOR.get(ploidy, 'Polyploid')
def serialize_from_biolomics(biolomics_strain, client=None):  # sourcery no-metrics
    """Build a StrainMirri from a Biolomics strain record payload.

    Walks MIRRI_FIELDS in reverse of serialize_to_biolomics, converting each
    Biolomics field back into the spec attribute.  When *client* is given,
    literature and sequence-marker RLinks are resolved to full records.
    """
    strain = StrainMirri()
    strain.record_id = biolomics_strain.get('RecordId', None)
    strain.record_name = biolomics_strain.get('RecordName', None)
    for field in MIRRI_FIELDS:
        try:
            biolomics_field = field["biolomics"]["field"]
        except KeyError:
            # print(f'biolomics not configured: {field["label"]}')
            continue
        label = field["label"]
        attribute = field["attribute"]
        field_data = biolomics_strain['RecordDetails'].get(biolomics_field, None)
        if field_data is None:
            continue
        is_empty = field_data.get('IsEmpty')
        if is_empty:
            continue
        if biolomics_field in ('Tested temperature growth range', 'Recommended growth temperature'):
            value = {'max': field_data.get('MaxValue', None),
                     'min': field_data.get('MinValue', None)}
        else:
            value = field_data['Value']
        # if value in (None, '', [], {}, '?', 'Unknown', 'nan', 'NaN'):
        #     continue
        # print(label, attribute, biolomics_field, value)
        if label == 'Accession number':
            # keep the MIRRI record name as a synonym id
            number = strain.record_name
            mirri_id = StrainId(number=number)
            strain.synonyms = [mirri_id]
            coll, num = value.split(' ', 1)
            accession_number_id = StrainId(collection=coll, number=num)
            strain.id = accession_number_id
            continue
        elif label == "Restrictions on use":
            value = REV_RESTRICTION_USE_TRANSLATOR[value]
        elif label == 'Nagoya protocol restrictions and compliance conditions':
            value = REV_NAGOYA_TRANSLATOR[value]
        elif label in FILE_TYPE_FIELDS:
            value = [f['Value'] for f in value]
        elif label == "Other culture collection numbers":
            other_numbers = []
            for on in value.split(";"):
                on = on.strip()
                try:
                    collection, number = on.split(" ", 1)
                except ValueError:
                    # no space: treat the whole token as the number
                    collection = None
                    number = on
                _id = StrainId(collection=collection, number=number)
                other_numbers.append(_id)
            value = other_numbers
        elif label in BOOLEAN_TYPE_FIELDS:
            value = value == 'yes'
        elif label == 'GMO':
            value = value == 'Yes'
        elif label == "Organism type":
            organism_types = [OrganismType(item['Name']) for item in value if item['Value'] == 'yes']
            if organism_types:
                value = organism_types
        # NOTE(review): `in 'Taxon name'` is a substring test; works only
        # because no other label is a substring of 'Taxon name' -- probably
        # meant `==`
        elif label in 'Taxon name':
            value = ";".join([v['Name']['Value'] for v in value])
            add_taxon_to_strain(strain, value)
            continue
        elif label in DATE_TYPE_FIELDS:
            # date_range = DateRange()
            value = DateRange().strpdate(value)
        elif label in ("Recommended growth temperature",
                       "Tested temperature growth range"):
            # NOTE(review): the mixed `or`/`and` makes the min clause always
            # False; likely meant `value['min'] is None or value['min'] == 0`
            if (value['max'] is None or value['max'] == 0 or
                    value['min'] is None and value['min'] == 0):
                continue
        elif label == "Recommended medium for growth":
            value = [v['Name']['Value'] for v in value]
        elif label == "Form of supply":
            value = [item['Name'] for item in value if item['Value'] == 'yes']
        elif label in LIST_TYPES_TO_JOIN:
            value = [v.strip() for v in value.split(";")]
        elif label == "Coordinates of geographic origin":
            if ('Longitude' in value and 'Latitude' in value and
                    isinstance(value['Longitude'], float) and
                    isinstance(value['Latitude'], float)):
                strain.collect.location.longitude = value['Longitude']
                strain.collect.location.latitude = value['Latitude']
                # NOTE(review): raises KeyError when 'Precision' is absent --
                # confirm the web service always sends it with coordinates
                if value['Precision'] != 0:
                    strain.collect.location.coord_uncertainty = value['Precision']
            continue
        elif label == "Altitude of geographic origin":
            value = float(value)
        elif label == "Geographic origin":
            strain.collect.location.site = value
            continue
        elif label == 'Ontobiotope':
            # extract the OBT accession from the linked record's name
            try:
                value = re.search("(OBT:[0-9]{5,7})", value[0]['Name']['Value']).group()
            except (KeyError, IndexError, AttributeError):
                continue
        elif label == 'Ploidy':
            value = REV_PLOIDY_TRANSLATOR[value]
        elif label == 'Literature':
            if client is not None:
                pubs = []
                for pub in value:
                    pub = client.retrieve_by_id(BIBLIOGRAPHY_WS, pub['RecordId'])
                    pubs.append(pub)
                value = pubs
        rsetattr(strain, attribute, value)
    # fields that are not in MIRRI FIELD list
    # country
    if 'Country' in biolomics_strain['RecordDetails'] and biolomics_strain['RecordDetails']['Country']:
        try:
            country_name = biolomics_strain['RecordDetails']['Country']['Value'][0]['Name']['Value']
            country = get_pycountry(country_name)
            country_3 = country.alpha_3 if country else None
        except (IndexError, KeyError):
            country_3 = None
        if country_3:
            strain.collect.location.country = country_3
    # Markers:
    if client:
        markers = []
        for marker_type, biolomics_marker in MARKER_TYPE_MAPPING.items():
            try:
                marker_value = biolomics_strain['RecordDetails'][biolomics_marker]['Value']
            except KeyError:
                continue
            if not marker_value:
                continue
            for marker in marker_value:
                record_id = marker['RecordId']
                marker = client.retrieve_by_id(SEQUENCE_WS, record_id)
                if marker is not None:
                    markers.append(marker)
        if markers:
            strain.genetics.markers = markers
    return strain
def get_country_record(country, client):
    """Try the pycountry name variants until one resolves to a remote RLink.

    Returns None when no variant is known to the web service.
    """
    for name_attr in ('common_name', 'name', 'official_name'):
        candidate = getattr(country, name_attr, None)
        if candidate is None:
            continue
        rlink = get_remote_rlink(client, COUNTRY_WS, candidate)
        if rlink is not None:
            return rlink
    return None

View File

@ -0,0 +1,64 @@
from mirri.entities.strain import Taxonomy
#TODO this is all wrong, needs deep revision
class TaxonomyMirri(Taxonomy):
    """Taxonomy variant backed by a plain _data dict.

    NOTE(review): this class defines __init__ twice -- the second definition
    silently replaces the first (and never calls super().__init__), and its
    field list looks copied from the growth-medium entity; confirm intent.
    """
    def __init__(self, **kwargs):
        super().__init__(freeze=False, **kwargs)
    # allowed attribute names for the dict-backed storage
    fields = ['record_id', 'record_name', 'acronym', 'full_description',
              'ingredients', 'description', 'other_name', 'ph',
              'sterilization_conditions']
    def __init__(self, **kwargs):
        self._data = {}
        for field in self.fields:
            # NOTE(review): kwargs['field'] looks like a bug -- it reads the
            # literal key 'field'; probably kwargs[field] was intended
            if field in kwargs and kwargs['field'] is not None:
                value = kwargs['field']
                setattr(self, field, value)
    def __setattr__(self, attr, value):
        # _data itself must bypass the allow-list check
        if attr == '_data':
            super().__setattr__(attr, value)
            return
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        self._data[attr] = value
    def __getattr__(self, attr):
        # NOTE(review): returning `super` (the builtin) for '_data' is
        # suspicious; confirm this path is ever taken
        if attr == '_data':
            return super
        if attr not in self.fields and attr != '_data':
            raise TypeError(f'{attr} not an allowed attribute')
        return self._data.get(attr, None)
    def dict(self):
        """Return the raw attribute storage."""
        return self._data
def serialize_from_biolomics(ws_data, client=None) -> 'TaxonomyMirri':
    """Return the raw web-service taxonomy payload.

    TODO: build a real TaxonomyMirri from the payload.  The previous version
    carried an unreachable block after the return -- leftover growth-medium
    deserialization code referencing an undefined GrowthMedium name -- which
    has been removed.
    """
    return ws_data

373
mirri/biolomics/settings.py Normal file
View File

@ -0,0 +1,373 @@
# secrets.py is developer-provided (not committed); fail fast with an
# actionable message when it is missing.
try:
    from mirri.biolomics.secrets import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
except ImportError:
    raise ImportError(
        'You need a secrets.py in the project dir. with CLIENT_ID, SECRET_ID, USERNAME, PASSWORD')
# Spec field definitions: strain attribute path, excel label, whether the
# column is mandatory, and -- when the field is pushed to the web service --
# the matching Biolomics field name and field-type code.  Entries without a
# "biolomics" key are skipped by the web-service serializers.
MIRRI_FIELDS = [
    {
        "attribute": "id",
        "label": "Accession number",
        "mandatory": True,
        "biolomics": {"field": "Collection accession number", "type": "E"},
    },
    {
        "attribute": "restriction_on_use",
        "label": "Restrictions on use",
        "mandatory": True,
        "biolomics": {"field": "Restrictions on use", "type": "T"},
    },
    {
        "attribute": "nagoya_protocol",
        "label": "Nagoya protocol restrictions and compliance conditions",
        "mandatory": True,
        "biolomics": {"field": "Nagoya protocol restrictions and compliance conditions", "type": "T"},
    },
    {
        "attribute": "abs_related_files",
        "label": "ABS related files",
        "mandatory": False,
        "biolomics": {"field": "ABS related files", "type": "U"},
    },
    {
        "attribute": "mta_files",
        "label": "MTA file",
        "mandatory": False,
        "biolomics": {"field": "MTA files URL", "type": "U"},
    },
    {
        "attribute": "other_numbers",
        "label": "Other culture collection numbers",
        "mandatory": False,
        "biolomics": {"field": "Other culture collection numbers", "type": "E"},
    },
    {
        "attribute": "is_from_registered_collection",
        "label": "Strain from a registered collection",
        "mandatory": False,
        "biolomics": {"field": "Strain from a registered collection", "type": "T"},
    },
    {
        "attribute": "risk_group",
        "label": "Risk Group",
        "mandatory": True,
        "biolomics": {"field": "Risk group", "type": "T"},
    },
    {
        "attribute": "is_potentially_harmful",
        "label": "Dual use",
        "mandatory": False,
        "biolomics": {"field": "Dual use", "type": "T"},
    },
    {
        "attribute": "is_subject_to_quarantine",
        "label": "Quarantine in Europe",
        "mandatory": False,
        "biolomics": {"field": "Quarantine in Europe", "type": "T"},
    },
    {
        "attribute": "taxonomy.organism_type",
        "label": "Organism type",
        "mandatory": True,
        "biolomics": {"field": "Organism type", "type": "C"},
    },
    {
        "attribute": "taxonomy.long_name",
        "label": "Taxon name",
        "mandatory": True,
        "biolomics": {"field": "Taxon name", "type": "SynLink"},
    },
    {
        "attribute": "taxonomy.infrasubspecific_name",
        "label": "Infrasubspecific names",
        "mandatory": False,
        "biolomics": {"field": "Infrasubspecific names", "type": "E"},
    },
    {
        "attribute": "taxonomy.comments",
        "label": "Comment on taxonomy",
        "mandatory": False,
        "biolomics": {"field": "Comment on taxonomy", "type": "E"},
    },
    {
        "attribute": "taxonomy.interspecific_hybrid",
        "label": "Interspecific hybrid",
        "mandatory": False,
        "biolomics": {"field": "Interspecific hybrid", "type": "T"},
    },
    {
        "attribute": "status", "label": "Status", "mandatory": False,
        "biolomics": {"field": "Status", "type": "E"},
    },
    {
        "attribute": "history",
        "label": "History of deposit",
        "mandatory": False,
        "biolomics": {"field": "History", "type": "E"},
    },
    {
        "attribute": "deposit.who",
        "label": "Depositor",
        "mandatory": False,
        "biolomics": {"field": "Depositor", "type": "E"},
    },
    {
        "attribute": "deposit.date",
        "label": "Date of deposit",
        "mandatory": False,
        "biolomics": {"field": "Deposit date", "type": "H"},
    },
    {
        "attribute": "catalog_inclusion_date",
        "label": "Date of inclusion in the catalogue",
        "mandatory": False,
        "biolomics": {"field": "Date of inclusion in the catalogue", "type": "H"},
    },
    {
        "attribute": "collect.who",
        "label": "Collected by",
        "mandatory": False,
        "biolomics": {"field": "Collector", "type": "E"},
    },
    {
        "attribute": "collect.date",
        "label": "Date of collection",
        "mandatory": False,
        "biolomics": {"field": "Collection date", "type": "H"},
    },
    {
        "attribute": "isolation.who",
        "label": "Isolated by",
        "mandatory": False,
        "biolomics": {"field": "Isolator", "type": "E"},
    },
    {
        "attribute": "isolation.date",
        "label": "Date of isolation",
        "mandatory": False,
        "biolomics": {"field": "Isolation date", "type": "H"},
    },
    {
        "attribute": "isolation.substrate_host_of_isolation",
        "label": "Substrate/host of isolation",
        "mandatory": False,
        "biolomics": {"field": "Substrate of isolation", "type": "E"},
    },
    {
        "attribute": "growth.tested_temp_range",
        "label": "Tested temperature growth range",
        "mandatory": False,
        "biolomics": {"field": "Tested temperature growth range", "type": "S"},
    },
    {
        "attribute": "growth.recommended_temp",
        "label": "Recommended growth temperature",
        "mandatory": True,
        "biolomics": {"field": "Recommended growth temperature", "type": "S"},
    },
    {
        "attribute": "growth.recommended_media",
        "label": "Recommended medium for growth",
        "mandatory": True,
        "biolomics": {"field": "Recommended growth medium", "type": "RLink"},
    },
    {
        "attribute": "form_of_supply",
        "label": "Form of supply",
        "mandatory": True,
        "biolomics": {"field": "Form", "type": "C"},
    },
    {
        "attribute": "other_denominations",
        "label": "Other denomination",
        "mandatory": False,
        "biolomics": {"field": "Other denomination", "type": "E"},
    },
    {
        # here we use latitude to check if there is data in some of the fields
        "attribute": "collect.location.latitude",
        "label": "Coordinates of geographic origin",
        "mandatory": False,
        "biolomics": {"field": "Coordinates of geographic origin", "type": "L"},
    },
    {
        "attribute": "collect.location.altitude",
        "label": "Altitude of geographic origin",
        "mandatory": False,
        "biolomics": {"field": "Altitude of geographic origin", "type": "D"},
    },
    {
        "attribute": "collect.location",
        "label": "Geographic origin",
        "mandatory": True,
        "biolomics": {"field": "Geographic origin", "type": "E"},
    },
    {
        "attribute": "collect.habitat",
        "label": "Isolation habitat",
        "mandatory": False,
        "biolomics": {"field": "Isolation habitat", "type": "E"},
    },
    # {
    #     "attribute": "collect.habitat_ontobiotope",
    #     "label": "Ontobiotope term for the isolation habitat",
    #     "mandatory": False,
    #     "biolomics": {"field": "Ontobiotope term for the isolation habitat", "type": "E"},
    # },
    {
        "attribute": "collect.habitat_ontobiotope",
        "label": "Ontobiotope",
        "mandatory": False,
        "biolomics": {"field": "Ontobiotope", "type": "RLink"},
    },
    {
        "attribute": "genetics.gmo", "label": "GMO", "mandatory": False,
        "biolomics": {"field": "GMO", "type": "V"},
    },
    {
        "attribute": "genetics.gmo_construction",
        "label": "GMO construction information",
        "mandatory": False,
        "biolomics": {"field": "GMO construction information", "type": "E"},
    },
    {
        "attribute": "genetics.mutant_info",
        "label": "Mutant information",
        "mandatory": False,
        "biolomics": {"field": "Mutant information", "type": "E"},
    },
    {
        "attribute": "genetics.genotype",
        "label": "Genotype",
        "mandatory": False,
        "biolomics": {"field": "Genotype", "type": "E"},
    },
    {
        "attribute": "genetics.sexual_state",
        "label": "Sexual state",
        "mandatory": False,
        "biolomics": {"field": "Sexual state", "type": "E"},
    },
    {
        "attribute": "genetics.ploidy",
        "label": "Ploidy",
        "mandatory": False,
        "biolomics": {"field": "Ploidy", "type": "T"},
    },
    {
        "attribute": "genetics.plasmids",
        "label": "Plasmids",
        "mandatory": False,
        "biolomics": {"field": "Plasmids", "type": "E"},
    },
    {
        "attribute": "genetics.plasmids_in_collections",
        "label": "Plasmids collections fields",
        "mandatory": False,
        "biolomics": {"field": "Plasmids collections fields", "type": "E"},
    },
    {
        "attribute": "publications",
        "label": "Literature",
        "mandatory": False,
        "biolomics": {"field": "Literature", "type": "RLink"},
    },
    {
        "attribute": "pathogenicity",
        "label": "Pathogenicity",
        "mandatory": False,
        "biolomics": {"field": "Pathogenicity", "type": "E"},
    },
    {
        "attribute": "enzyme_production",
        "label": "Enzyme production",
        "mandatory": False,
        "biolomics": {"field": "Enzyme production", "type": "E"},
    },
    {
        "attribute": "production_of_metabolites",
        "label": "Production of metabolites",
        "mandatory": False,
        "biolomics": {"field": "Metabolites production", "type": "E"},
    },
    {
        "attribute": "applications",
        "label": "Applications",
        "mandatory": False,
        "biolomics": {"field": "Applications", "type": "E"},
    },
    {
        "attribute": "remarks", "label": "Remarks", "mandatory": False,
        "biolomics": {"field": "Remarks", "type": "E"},
    },
    {
        "attribute": "literature_linked_to_the_sequence_genome",
        "label": "Literature linked to the sequence/genome",
        "mandatory": False,
        # "biolomics": {"field": "MTA files URL", "type": "U"},
    },
]
# Publication attribute -> Biolomics bibliography field mapping.
# 'pub_id' intentionally maps to no remote field.
# BUG FIXES: removed two duplicate 'last_page' entries that mapped to empty
# field names, and renamed the misspelled 'volumen' attribute to 'volume'
# (the Publication entity and PUB_MAPPING both use 'volume', so the typo
# meant Volume was never exported).
PUB_MIRRI_FIELDS = [
    {
        "attribute": "pub_id", "mandatory": False,
        "biolomics": {"field": "", "type": "E"},
    },
    {
        "attribute": "pubmed_id", "mandatory": False,
        "biolomics": {"field": "PubMed ID", "type": "E"},
    },
    {
        "attribute": "doi", "mandatory": False,
        "biolomics": {"field": "DOI number", "type": "E"},
    },
    {
        "attribute": "title", "mandatory": False,
        "biolomics": {"field": "Title", "type": "E"},
    },
    {
        "attribute": "authors", "mandatory": False,
        "biolomics": {"field": "Authors", "type": "E"},
    },
    {
        "attribute": "journal", "mandatory": False,
        "biolomics": {"field": "Journal", "type": "E"},
    },
    {
        "attribute": "volume", "mandatory": False,
        "biolomics": {"field": "Volume", "type": "E"},
    },
    {
        "attribute": "issue", "mandatory": False,
        "biolomics": {"field": "Issue", "type": "E"},
    },
    {
        "attribute": "first_page", "mandatory": False,
        "biolomics": {"field": "Page from", "type": "E"},
    },
    {
        "attribute": "last_page", "mandatory": False,
        "biolomics": {"field": "Page to", "type": "E"},
    },
    {
        "attribute": "book_title", "label": "", "mandatory": False,
        "biolomics": {"field": "Book title", "type": "E"},
    },
    {
        "attribute": "publisher", "label": "", "mandatory": False,
        "biolomics": {"field": "Publisher", "type": "E"},
    },
    {
        "attribute": "editor", "label": "", "mandatory": False,
        "biolomics": {"field": "Editor(s)", "type": "E"},
    },
]

3603
mirri/data/ontobiotopes.csv Normal file

File diff suppressed because it is too large Load Diff

View File

View File

@ -0,0 +1,45 @@
class FrozenClass(object):
    """Base class whose instances can be frozen against new attributes.

    After ``_freeze()`` is called, assigning a name that is not already an
    attribute raises TypeError; existing attributes remain writable.
    """

    __isfrozen = False

    def __setattr__(self, key, value):
        # Allow the assignment while unfrozen, or when the attribute
        # already exists; otherwise reject it.
        if not self.__isfrozen or hasattr(self, key):
            object.__setattr__(self, key, value)
            return
        msg = f"Can not add {key} to {self.__class__.__name__}. It is not one of its attributes"
        raise TypeError(msg)

    def _freeze(self):
        # From this point on __setattr__ rejects unknown attribute names.
        self.__isfrozen = True
class _FieldBasedClass(FrozenClass):
    """Frozen value object whose attributes are described by ``_fields``.

    Each ``_fields`` entry maps an ``attribute`` name to the data ``label``
    under which the value is read from / written to plain dicts.
    """

    _fields = []

    def __init__(self, data=None, freeze=True):
        self._data = {}
        data = {} if data is None else data
        for field_def in self._fields:
            setattr(self, field_def["attribute"],
                    data.get(field_def["label"], None))
        if freeze:
            self._freeze()

    def __eq__(self, o: object) -> bool:
        # Equal when every declared attribute matches the other object's.
        return all(
            getattr(self, field_def["attribute"], None) ==
            getattr(o, field_def["attribute"], None)
            for field_def in self._fields)

    def __bool__(self):
        return bool(self.dict())

    def dict(self):
        """Return a label -> value mapping, omitting None values."""
        result = {}
        for field_def in self._fields:
            value = getattr(self, field_def["attribute"])
            if value is not None:
                result[field_def["label"]] = value
        return result

View File

@ -0,0 +1,87 @@
from calendar import monthrange
from collections import OrderedDict
from copy import copy
from datetime import date
class DateRange:
    """A possibly partial date: month and day may be missing.

    A partial date is represented internally as the range of real dates it
    covers, e.g. ``year=2020, month=2`` covers 2020-02-01 .. 2020-02-29.
    NOTE(review): a year is effectively required to build a range; month/day
    without a year would fail inside ``_create_range`` — confirm callers
    never do that.
    """

    def __init__(self, year=None, month=None, day=None):
        self._year = year
        if month is not None and (month < 1 or month > 12):
            raise ValueError("Month must be between 1 and 12")
        self._month = month
        if day is not None and (day < 1 or day > 31):
            raise ValueError("Day must be between 1 and 31")
        self._day = day
        self._start = None
        self._end = None
        if year or month or day:
            self._create_range()

    def __str__(self):
        _strdate = self.strfdate
        if _strdate is None:
            return ""
        return _strdate

    def __bool__(self):
        return bool(self._year or self._month or self._day)

    def _create_range(self):
        # Expand the known parts into a [start, end] pair of real dates.
        year = self._year
        month = self._month
        day = self._day
        if year and month and day:
            start_date = date(year=year, month=month, day=day)
            end_date = date(year=year, month=month, day=day)
        elif month is None:
            start_date = date(year=year, month=1, day=1)
            end_date = date(year=year, month=12, day=31)
        elif day is None:
            # monthrange gives (weekday of day 1, number of days in month).
            month_last_day = monthrange(year, month)[1]
            start_date = date(year=year, month=month, day=1)
            end_date = date(year=year, month=month, day=month_last_day)
        self._start = start_date
        self._end = end_date

    def strpdate(self, date_str: str):
        """Parse a YYYY[MM[DD]] string ('-' or '/' separators allowed).

        Returns self, so it can be chained: ``DateRange().strpdate(...)``.
        Raises ValueError for malformed input.
        """
        date_str = str(date_str)
        orig_date = copy(date_str)
        date_str = date_str.replace("/", "").replace("-", "")
        if len(date_str) > 8:
            msg = f"Malformed date, more characters than expected: {orig_date}"
            raise ValueError(msg)
        if len(date_str) < 4:
            # Bug fix: the original fell through with `year` undefined and
            # crashed with a NameError; a 4-digit year is always required.
            msg = f"Malformed date, at least a 4 digit year is required: {orig_date}"
            raise ValueError(msg)
        month = None
        day = None
        year = int(date_str[:4])
        if len(date_str) >= 6:
            month = int(date_str[4:6])
            if month < 1 or month > 12:
                raise ValueError("Month must be between 1 and 12")
        if len(date_str) >= 8:
            day = int(date_str[6:8])
            if day < 1 or day > 31:
                raise ValueError("Day must be between 1 and 31")
        self._year = year
        self._month = month
        self._day = day
        self._create_range()
        return self

    @property
    def strfdate(self):
        """Partial-date string, missing parts dashed, e.g. '202002--'."""
        year = "----" if self._year is None else f"{self._start.year:04}"
        month = "--" if self._month is None else f"{self._start.month:02}"
        day = "--" if self._day is None else f"{self._start.day:02}"
        _date = str(f"{year}{month}{day}")
        if _date == "--------":
            return None
        return _date

    @property
    def range(self):
        """OrderedDict with the 'start' and 'end' datetime.date bounds."""
        return OrderedDict([("start", self._start), ("end", self._end)])

View File

@ -0,0 +1,47 @@
class GrowthMedium:
    """Container for a culture growth medium.

    Only the names in ``fields`` may be set or read; everything is stored in
    a private dict so ``dict()`` exposes exactly the assigned values.
    """

    fields = ['record_id', 'record_name', 'acronym', 'full_description',
              'ingredients', 'description', 'other_name', 'ph',
              'sterilization_conditions']

    def __init__(self, **kwargs):
        self._data = {}
        for field in self.fields:
            # Bug fix: the original looked up the literal key 'field'
            # (kwargs['field']) instead of the field name, so constructor
            # keyword values were lost or raised KeyError.
            if field in kwargs and kwargs[field] is not None:
                setattr(self, field, kwargs[field])

    def __setattr__(self, attr, value):
        if attr == '_data':
            super().__setattr__(attr, value)
            return
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        self._data[attr] = value

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails (e.g. '_data'
        # before __init__ has run). Bug fix: the original returned the
        # `super` builtin here instead of signalling the missing attribute.
        if attr == '_data':
            raise AttributeError(attr)
        if attr not in self.fields:
            raise TypeError(f'{attr} not an allowed attribute')
        return self._data.get(attr, None)

    def dict(self):
        """Return the underlying field/value mapping (not a copy)."""
        return self._data

    def update(self, growth_media, include_fields=None):
        """Copy differing, non-None values of *include_fields* from *growth_media*.

        NOTE(review): when include_fields is None nothing is updated —
        confirm that is the intended behaviour (it matches the original).
        """
        for field in self.fields:
            if include_fields and field in include_fields:
                new_value = getattr(growth_media, field, None)
                actual_value = getattr(self, field, None)
                if new_value is not None and new_value != actual_value:
                    setattr(self, field, new_value)

    def is_equal(self, other, exclude_fields=None):
        """True when every non-None field of self matches *other*.

        Fields listed in *exclude_fields* are ignored.
        """
        # Bug fix: mutable default argument replaced with None sentinel.
        if exclude_fields is None:
            exclude_fields = []
        for field in self.fields:
            if field in exclude_fields:
                continue
            value_of_other = getattr(other, field, None)
            value_of_self = getattr(self, field, None)
            if value_of_self is not None and value_of_self != value_of_other:
                return False
        return True

170
mirri/entities/location.py Normal file
View File

@ -0,0 +1,170 @@
from __future__ import annotations
import hashlib
from typing import Union
from mirri.entities._private_classes import _FieldBasedClass
from mirri.settings import (
ALTITUDE,
COORD_SPATIAL_REFERENCE,
COORDUNCERTAINTY,
COUNTRY,
GEOREF_METHOD,
ISLAND,
LATITUDE,
LONGITUDE,
MUNICIPALITY,
OTHER,
PROVINCE,
SITE,
STATE,
)
import pycountry
class Location(_FieldBasedClass):
    """Geographic origin of a sample, from country down to coordinates."""

    _fields = [
        {"attribute": "country", "label": COUNTRY},
        {"attribute": "state", "label": STATE},
        {"attribute": "province", "label": PROVINCE},
        {"attribute": "municipality", "label": MUNICIPALITY},
        {"attribute": "site", "label": SITE},
        {"attribute": "other", "label": OTHER},
        {"attribute": "island", "label": ISLAND},
        {"attribute": "longitude", "label": LONGITUDE},
        {"attribute": "latitude", "label": LATITUDE},
        {"attribute": "altitude", "label": ALTITUDE},
        {"attribute": "coord_spatial_reference", "label": COORD_SPATIAL_REFERENCE},
        {"attribute": "coord_uncertainty", "label": COORDUNCERTAINTY},
        {"attribute": "georef_method", "label": GEOREF_METHOD},
    ]

    def __str__(self):
        _site = []
        if self.country:
            _site.append(self.country)
        if self.province:
            _site.append(self.province)
        if self.site:
            _site.append(self.site)
        if self.municipality:
            _site.append(self.municipality)
        return ": ".join(_site)

    def __hash__(self):
        # Hash over all field values so only identical locations collide.
        # Bug fix: the original called getattr(self, field, None) with the
        # whole field *dict* as the name, which always returned the default,
        # so every Location instance hashed to the same value.
        hash_str = ''
        for field in self._fields:
            hash_str += str(getattr(self, field["attribute"], None))
        return int(hashlib.sha1(hash_str.encode("utf-8")).hexdigest(), 16) % (10 ** 8)

    @property
    def country(self) -> Union[str, None]:
        """ISO alpha-3 country code; 'INW' is accepted (international waters)."""
        return self._data.get(COUNTRY, None)

    @country.setter
    def country(self, code3: str):
        if code3 is not None:
            _country = pycountry.countries.get(alpha_3=code3)
            if _country is None:
                _country = pycountry.historic_countries.get(alpha_3=code3)
            if _country is None and code3 != 'INW':
                raise ValueError(f'{code3}, not a valid 3 letter country name')
            self._data[COUNTRY] = code3

    @property
    def province(self) -> Union[str, None]:
        return self._data.get(PROVINCE, None)

    @province.setter
    def province(self, code3: str):
        self._data[PROVINCE] = code3

    @property
    def municipality(self) -> Union[str, None]:
        return self._data.get(MUNICIPALITY, None)

    @municipality.setter
    def municipality(self, name: str):
        self._data[MUNICIPALITY] = name

    @property
    def site(self) -> Union[str, None]:
        return self._data.get(SITE, None)

    @site.setter
    def site(self, name: str):
        self._data[SITE] = name

    @property
    def latitude(self) -> Union[float, None]:
        return self._data.get(LATITUDE, None)

    @latitude.setter
    def latitude(self, latitude: float):
        self._data[LATITUDE] = latitude

    @property
    def longitude(self) -> Union[float, None]:
        return self._data.get(LONGITUDE, None)

    @longitude.setter
    def longitude(self, longitude: float):
        self._data[LONGITUDE] = longitude

    @property
    def altitude(self) -> Union[int, float, None]:
        return self._data.get(ALTITUDE, None)

    @altitude.setter
    def altitude(self, altitude: Union[int, float]):
        self._data[ALTITUDE] = altitude

    @property
    def georef_method(self) -> Union[str, None]:
        return self._data.get(GEOREF_METHOD, None)

    @georef_method.setter
    def georef_method(self, georef_method: str):
        self._data[GEOREF_METHOD] = georef_method

    @property
    def coord_uncertainty(self) -> Union[str, None]:
        return self._data.get(COORDUNCERTAINTY, None)

    @coord_uncertainty.setter
    def coord_uncertainty(self, coord_uncertainty: str):
        self._data[COORDUNCERTAINTY] = coord_uncertainty

    @property
    def coord_spatial_reference(self) -> Union[str, None]:
        return self._data.get(COORD_SPATIAL_REFERENCE, None)

    @coord_spatial_reference.setter
    def coord_spatial_reference(self, coord_spatial_reference: str):
        self._data[COORD_SPATIAL_REFERENCE] = coord_spatial_reference

    @property
    def state(self) -> Union[str, None]:
        return self._data.get(STATE, None)

    @state.setter
    def state(self, state):
        self._data[STATE] = state

    @property
    def island(self) -> Union[str, None]:
        return self._data.get(ISLAND, None)

    @island.setter
    def island(self, island):
        self._data[ISLAND] = island

    @property
    def other(self) -> Union[str, None]:
        return self._data.get(OTHER, None)

    @other.setter
    def other(self, other):
        self._data[OTHER] = other

View File

@ -0,0 +1,202 @@
from mirri.settings import (BOOK_EDITOR, BOOK_PUBLISHER, BOOK_TITLE,
PUB_AUTHORS, PUB_DOI, PUB_FIRST_PAGE, PUB_ID,
PUB_ISSUE, PUB_JOURNAL, PUB_LAST_PAGE,
PUB_PUBMED_ID, PUB_TITLE, PUB_VOLUME)
# Maybe we could implement some crossref calls to fill all field data
# and get DOI where ever is possible
# Keys of the Biolomics record metadata stored in Publication._data.
RECORD_ID = 'RecordId'
RECORD_NAME = 'RecordName'
class Publication:
    """Bibliographic reference (journal article or book) for a strain.

    Values live in a private dict keyed by the field constants; every setter
    ignores None, so the dict only ever holds real values — which is what
    ``__bool__`` relies on.
    """

    def __init__(self, data=None):
        self._data = {}
        if data:
            self.record_id = data.get('RecordId', None)
            self.record_name = data.get('RecordName', None)
            self.pubmed_id = data.get(PUB_PUBMED_ID, None)
            self.doi = data.get(PUB_DOI, None)
            self.title = data.get(PUB_TITLE, None)
            self.authors = data.get(PUB_AUTHORS, None)
            self.journal = data.get(PUB_JOURNAL, None)
            self.volume = data.get(PUB_VOLUME, None)
            self.issue = data.get(PUB_ISSUE, None)
            self.first_page = data.get(PUB_FIRST_PAGE, None)
            self.last_page = data.get(PUB_LAST_PAGE, None)
            # Bug fix: the original assigned to `self.editor`, a plain
            # instance attribute that bypassed the `editors` property and so
            # never stored the value in _data (dict()/__bool__ missed it).
            self.editors = data.get(BOOK_EDITOR, None)
            self.publisher = data.get(BOOK_PUBLISHER, None)
            self.book_title = data.get(BOOK_TITLE, None)
            self.isbn = data.get('ISBN', None)
            self.issn = data.get('ISSN', None)
            self.year = data.get('Year', None)

    def __bool__(self):
        return bool(self._data)

    def dict(self):
        """Return the underlying field/value mapping (not a copy)."""
        return self._data

    @property
    def id(self) -> int:
        return self._data.get(PUB_ID, None)

    @id.setter
    def id(self, value: int):
        if value is not None:
            self._data[PUB_ID] = value

    @property
    def record_id(self) -> int:
        return self._data.get(RECORD_ID, None)

    @record_id.setter
    def record_id(self, value: int):
        if value is not None:
            self._data[RECORD_ID] = value

    @property
    def record_name(self) -> str:
        return self._data.get(RECORD_NAME, None)

    @record_name.setter
    def record_name(self, value: str):
        if value is not None:
            self._data[RECORD_NAME] = value

    @property
    def pubmed_id(self):
        return self._data.get(PUB_PUBMED_ID, None)

    @pubmed_id.setter
    def pubmed_id(self, value: str):
        if value is not None:
            self._data[PUB_PUBMED_ID] = value

    @property
    def isbn(self):
        return self._data.get('ISBN', None)

    @isbn.setter
    def isbn(self, value: str):
        if value is not None:
            self._data['ISBN'] = value

    @property
    def issn(self):
        return self._data.get('ISSN', None)

    @issn.setter
    def issn(self, value: str):
        if value is not None:
            self._data['ISSN'] = value

    @property
    def doi(self):
        return self._data.get(PUB_DOI, None)

    @doi.setter
    def doi(self, value: str):
        if value is not None:
            self._data[PUB_DOI] = value

    @property
    def title(self):
        return self._data.get(PUB_TITLE, None)

    @title.setter
    def title(self, value: str):
        if value is not None:
            self._data[PUB_TITLE] = value
            # The Biolomics record name mirrors the title.
            self._data[RECORD_NAME] = value

    @property
    def authors(self):
        return self._data.get(PUB_AUTHORS, None)

    @authors.setter
    def authors(self, value: str):
        if value is not None:
            self._data[PUB_AUTHORS] = value

    @property
    def journal(self):
        return self._data.get(PUB_JOURNAL, None)

    @journal.setter
    def journal(self, value: str):
        if value is not None:
            self._data[PUB_JOURNAL] = value

    @property
    def volume(self):
        return self._data.get(PUB_VOLUME, None)

    @volume.setter
    def volume(self, value: str):
        if value is not None:
            self._data[PUB_VOLUME] = value

    @property
    def issue(self):
        return self._data.get(PUB_ISSUE, None)

    @issue.setter
    def issue(self, value: str):
        if value is not None:
            self._data[PUB_ISSUE] = value

    @property
    def first_page(self):
        return self._data.get(PUB_FIRST_PAGE, None)

    @first_page.setter
    def first_page(self, value: str):
        if value is not None:
            self._data[PUB_FIRST_PAGE] = value

    @property
    def last_page(self):
        return self._data.get(PUB_LAST_PAGE, None)

    @last_page.setter
    def last_page(self, value: str):
        if value is not None:
            self._data[PUB_LAST_PAGE] = value

    @property
    def book_title(self):
        return self._data.get(BOOK_TITLE, None)

    @book_title.setter
    def book_title(self, value: str):
        if value is not None:
            self._data[BOOK_TITLE] = value

    @property
    def editors(self):
        return self._data.get(BOOK_EDITOR, None)

    @editors.setter
    def editors(self, value: str):
        if value is not None:
            self._data[BOOK_EDITOR] = value

    @property
    def editor(self):
        # Backwards-compatible alias: some field maps use the singular name.
        return self._data.get(BOOK_EDITOR, None)

    @editor.setter
    def editor(self, value: str):
        if value is not None:
            self._data[BOOK_EDITOR] = value

    @property
    def publisher(self):
        return self._data.get(BOOK_PUBLISHER, None)

    @publisher.setter
    def publisher(self, value: str):
        if value is not None:
            self._data[BOOK_PUBLISHER] = value

    @property
    def year(self) -> int:
        return self._data.get('Year', None)

    @year.setter
    def year(self, value: int):
        if value is not None:
            self._data['Year'] = value

View File

@ -0,0 +1,45 @@
from mirri.entities._private_classes import _FieldBasedClass
from mirri.settings import (
ALLOWED_MARKER_TYPES,
MARKER_INSDC,
MARKER_SEQ,
MARKER_TYPE)
from mirri import ValidationError
class GenomicSequence(_FieldBasedClass):
    """A genomic marker: its type, INSDC accession number and sequence."""

    _fields = [
        {"attribute": "marker_type", "label": MARKER_TYPE},
        {"attribute": "marker_id", "label": MARKER_INSDC},
        {"attribute": "marker_seq", "label": MARKER_SEQ},
    ]

    @property
    def marker_type(self):
        return self._data.get(MARKER_TYPE, None)

    @marker_type.setter
    def marker_type(self, value: str):
        if value is not None:
            # Bug fix: the original joined all acronyms into one string and
            # used a substring test (`value not in types`), so any fragment
            # of an acronym (e.g. "16S") was wrongly accepted.
            allowed = [m["acronym"] for m in ALLOWED_MARKER_TYPES]
            if value not in allowed:
                types = " ".join(allowed)
                msg = f"{value} not in allowed marker types: {types}"
                raise ValidationError(msg)
            self._data[MARKER_TYPE] = value

    @property
    def marker_id(self) -> str:
        return self._data.get(MARKER_INSDC, None)

    @marker_id.setter
    def marker_id(self, value: str):
        self._data[MARKER_INSDC] = value

    @property
    def marker_seq(self) -> str:
        return self._data.get(MARKER_SEQ, None)

    @marker_seq.setter
    def marker_seq(self, value: str):
        self._data[MARKER_SEQ] = value

1243
mirri/entities/strain.py Normal file

File diff suppressed because it is too large Load Diff

0
mirri/io/__init__.py Normal file
View File

View File

79
mirri/io/parsers/excel.py Normal file
View File

@ -0,0 +1,79 @@
from io import BytesIO
from openpyxl import load_workbook
def excel_dict_reader(fhand, sheet_name, mandatory_column_name=None):
    """Open the workbook behind *fhand* and yield row dicts of *sheet_name*.

    The file handle is rewound and fully read into memory before parsing.
    """
    fhand.seek(0)
    workbook = load_workbook(filename=BytesIO(fhand.read()),
                             data_only=True, read_only=True)
    return workbook_sheet_reader(workbook, sheet_name,
                                 mandatory_column_name=mandatory_column_name)
def is_none(value):
    """Return True when *value* is exactly None (not merely falsy)."""
    if value is None:
        return True
    return False
def workbook_sheet_reader(workbook, sheet_name, mandatory_column_name=None,
                          allowed_empty_line_slots=5):
    """Yield one dict per data row of *sheet_name*, keyed by the header row.

    The first row is taken as the header. Rows whose cells are all falsy are
    skipped, and after *allowed_empty_line_slots* consecutive empty rows the
    sheet is considered finished and iteration stops early. If
    *mandatory_column_name* is given, rows with an empty value in that
    column are skipped as well.

    Raises ValueError when the sheet is missing from the workbook.
    """
    try:
        sheet = workbook[sheet_name]
    except KeyError as error:
        raise ValueError(f"The '{sheet_name}' sheet is missing.") from error
    first = True
    header = []
    empty_lines = 0
    for row in sheet.rows:
        values = []
        for cell in row:
            # 's' is openpyxl's data-type code for string cells; only those
            # are stripped — numbers, dates, etc. pass through unchanged.
            if cell.value is not None and cell.data_type == 's':
                value = str(cell.value).strip()
            else:
                value = cell.value
            values.append(value)
        # values = [cell.value.strip() for cell in row]
        if first:
            header = values
            first = False
            continue
        if not any(values):
            empty_lines += 1
            if empty_lines >= allowed_empty_line_slots:
                break
            continue
        empty_lines = 0
        data = dict(zip(header, values))
        if mandatory_column_name is not None and not data[mandatory_column_name]:
            # msg = f"Exiting before end of sheet {sheet_name} ends.\n"
            # msg += f"Mandatory column ({mandatory_column_name}) empty. \n"
            # msg += "Check file for empty lines"
            # print(msg)
            continue
        yield data
def get_all_cell_data_from_sheet(workbook, sheet_name, allowed_empty_line_slots=5):
    """Return every cell value of *sheet_name* flattened into one list.

    String cells are stripped; other cell types are kept as-is. Rows whose
    cells are all falsy are skipped, and after *allowed_empty_line_slots*
    consecutive empty rows the sheet is considered finished.

    Raises ValueError when the sheet is missing from the workbook.
    """
    try:
        sheet = workbook[sheet_name]
    except KeyError as error:
        raise ValueError(f"The '{sheet_name}' sheet is missing.") from error
    empty_lines = 0
    all_values = []
    for row in sheet.rows:
        values = []
        for cell in row:
            # 's' is openpyxl's data-type code for string cells.
            if cell.value is not None and cell.data_type == 's':
                value = str(cell.value).strip()
            else:
                value = cell.value
            values.append(value)
        if not any(values):
            empty_lines += 1
            if empty_lines >= allowed_empty_line_slots:
                break
            continue
        empty_lines = 0
        all_values.extend(values)
    return all_values

View File

@ -0,0 +1,276 @@
import re
from datetime import date
from io import BytesIO
import pycountry
from openpyxl import load_workbook
from mirri import rsetattr, ValidationError
from mirri.biolomics.serializers.sequence import GenomicSequenceBiolomics
from mirri.biolomics.serializers.strain import StrainMirri
from mirri.entities.growth_medium import GrowthMedium
from mirri.io.parsers.excel import workbook_sheet_reader
from mirri.entities.publication import Publication
from mirri.entities.date_range import DateRange
from mirri.entities.strain import OrganismType, StrainId, add_taxon_to_strain
from mirri.settings import (COMMERCIAL_USE_WITH_AGREEMENT, GENOMIC_INFO,
GROWTH_MEDIA, LITERATURE_SHEET, LOCATIONS,
MIRRI_FIELDS, NAGOYA_DOCS_AVAILABLE, NAGOYA_NO_RESTRICTIONS,
NAGOYA_PROBABLY_SCOPE, NO_RESTRICTION,
ONLY_RESEARCH, ONTOBIOTOPE,
PUBLICATION_FIELDS, STRAINS, SUBTAXAS)
from mirri.utils import get_country_from_name
# Numeric codes of the excel "Restriction on use" column mapped to the
# internal restriction constants.
RESTRICTION_USE_TRANSLATOR = {
    1: NO_RESTRICTION,
    2: ONLY_RESEARCH,
    3: COMMERCIAL_USE_WITH_AGREEMENT,
}
# Numeric codes of the excel "Nagoya protocol" column.
NAGOYA_TRANSLATOR = {
    1: NAGOYA_NO_RESTRICTIONS,
    2: NAGOYA_DOCS_AVAILABLE,
    3: NAGOYA_PROBABLY_SCOPE,
}
# Excel boolean convention: 1 means "no" (False), 2 means "yes" (True).
TRUEFALSE_TRANSLATOR = {
    1: False,
    2: True
}
def parse_mirri_excel(fhand, version="20200601"):
    """Parse a MIRRI excel file into strains and growth media.

    Only the 20200601 layout is supported; any other *version* raises
    NotImplementedError.
    """
    if version != "20200601":
        raise NotImplementedError("Only version 20200601 is implemented")
    return _parse_mirri_v20200601(fhand)
def _parse_mirri_v20200601(fhand):
    """Parse the 20200601 MIRRI excel layout.

    Returns a dict with a lazy 'strains' generator and the eagerly parsed
    'growth_media' list.
    """
    fhand.seek(0)
    file_content = BytesIO(fhand.read())
    workbook = load_workbook(filename=file_content, read_only=True,
                             data_only=True)
    locations = workbook_sheet_reader(workbook, LOCATIONS)
    ontobiotopes = workbook_sheet_reader(workbook, ONTOBIOTOPE)
    growth_media = list(parse_growth_media(workbook))
    markers = workbook_sheet_reader(workbook, GENOMIC_INFO)
    publications = list(parse_publications(workbook))
    strains = parse_strains(workbook, locations=locations,
                            growth_media=growth_media, markers=markers,
                            publications=publications,
                            ontobiotopes=ontobiotopes)
    return {"strains": strains, "growth_media": growth_media}
def index_list_by(list_, id_):
    """Index dict items by the stringified value found under key *id_*."""
    indexed = {}
    for item in list_:
        indexed[str(item[id_])] = item
    return indexed
def index_list_by_attr(list_, id_):
    """Index objects by the stringified value of attribute *id_*."""
    indexed = {}
    for item in list_:
        indexed[str(getattr(item, id_))] = item
    return indexed
def index_markers(markers):
    """Group genomic-marker rows by their 'Strain AN' accession number."""
    grouped = {}
    for marker in markers:
        grouped.setdefault(marker["Strain AN"], []).append(marker)
    return grouped
def remove_hard_lines(string=None):
    """Strip CRLF runs and tabs from *string* and trim it.

    None and the empty string both yield None.
    """
    if string is None or string == '':
        return None
    return re.sub(r'\r+\n+|\t+', '', string).strip()
def parse_growth_media(wb):
    """Yield one GrowthMedium per row of the growth-media sheet."""
    for row in workbook_sheet_reader(wb, GROWTH_MEDIA):
        medium = GrowthMedium()
        medium.acronym = str(row['Acronym'])
        medium.description = row['Description']
        medium.full_description = remove_hard_lines(
            row.get('Full description', None))
        yield medium
def parse_publications(wb):
    """Yield one Publication per row of the literature sheet.

    Only columns listed in PUBLICATION_FIELDS are read; falsy cells are
    ignored. A row with no recognized values still yields an (empty)
    Publication, matching the original behaviour.
    """
    # The unused `ids = []` accumulator of the original was removed.
    for row in workbook_sheet_reader(wb, LITERATURE_SHEET):
        pub = Publication()
        for pub_field in PUBLICATION_FIELDS:
            label = pub_field["label"]
            col_val = row.get(label, None)
            if col_val:
                attribute = pub_field["attribute"]
                setattr(pub, attribute, col_val)
        yield pub
def parse_strains(wb, locations, growth_media, markers, publications,
                  ontobiotopes):
    """Yield one StrainMirri per row of the 'Strains' sheet.

    The auxiliary sheets (locations, growth media, markers, publications and
    ontobiotopes) are indexed first and joined into each strain while the
    MIRRI_FIELDS columns are translated into strain attributes.

    Raises ValidationError for taxon names that do not follow the
    specification; unknown locations/ontobiotopes raise KeyError.
    """
    ontobiotopes_by_id = {str(ont["ID"]): ont['Name'] for ont in ontobiotopes}
    ontobiotopes_by_name = {v: k for k, v in ontobiotopes_by_id.items()}
    locations = index_list_by(locations, 'Locality')
    growth_media = index_list_by_attr(growth_media, 'acronym')
    publications = index_list_by_attr(publications, 'id')
    markers = index_markers(markers)
    for strain_row in workbook_sheet_reader(wb, STRAINS, "Accession number"):
        strain = StrainMirri()
        # Bug fix: keep the accession number at hand for error messages (the
        # original left strain_id as None until after the field loop, so the
        # taxon error always read 'Accession Number None').
        strain_id = strain_row.get("Accession number")
        label = None
        for field in MIRRI_FIELDS:
            label = field["label"]
            attribute = field["attribute"]
            value = strain_row[label]
            if value is None or value == '':
                continue
            if attribute == "id":
                collection, number = value.split(" ", 1)
                value = StrainId(collection=collection, number=number)
                rsetattr(strain, attribute, value)
            elif attribute == "restriction_on_use":
                rsetattr(strain, attribute, RESTRICTION_USE_TRANSLATOR[value])
            elif attribute == "nagoya_protocol":
                rsetattr(strain, attribute, NAGOYA_TRANSLATOR[value])
            elif attribute == "other_numbers":
                other_numbers = []
                for on in value.split(";"):
                    on = on.strip()
                    try:
                        collection, number = on.split(" ", 1)
                    except ValueError:
                        # No collection acronym: keep the bare number.
                        collection = None
                        number = on
                    _id = StrainId(collection=collection, number=number)
                    other_numbers.append(_id)
                rsetattr(strain, attribute, other_numbers)
            elif attribute == "taxonomy.taxon_name":
                try:
                    add_taxon_to_strain(strain, value)
                except ValueError:
                    msg = f"The '{label}' for strain with Accession Number {strain_id} is not according to the specification."
                    raise ValidationError(msg)
            elif attribute == "taxonomy.organism_type":
                value = [OrganismType(val.strip())
                         for val in str(value).split(";")]
                rsetattr(strain, attribute, value)
            elif attribute in ("deposit.date", "collect.date", "isolation.date",
                              "catalog_inclusion_date"):
                if isinstance(value, date):
                    value = DateRange(
                        year=value.year, month=value.month, day=value.day
                    )
                elif isinstance(value, str):
                    value = DateRange().strpdate(value)
                else:
                    raise NotImplementedError()
                rsetattr(strain, attribute, value)
            elif attribute == 'growth.recommended_temp':
                temps = value.split(';')
                if len(temps) == 1:
                    _min, _max = float(temps[0]), float(temps[0])
                else:
                    _min, _max = float(temps[0]), float(temps[1])
                rsetattr(strain, attribute, {'min': _min, 'max': _max})
            elif attribute == "growth.recommended_media":
                sep = "/"
                if ";" in value:
                    sep = ";"
                # Bug fix: use a dedicated local name instead of clobbering
                # the indexed growth_media mapping built above the loop.
                media_acronyms = [v.strip() for v in value.split(sep)]
                rsetattr(strain, attribute, media_acronyms)
            elif attribute == 'growth.tested_temp_range':
                if value:
                    min_, max_ = value.split(";")
                    value = {'min': float(min_), 'max': float(max_)}
                rsetattr(strain, attribute, value)
            elif attribute == "form_of_supply":
                rsetattr(strain, attribute, value.split(";"))
            elif attribute == "collect.location.coords":
                items = value.split(";")
                strain.collect.location.latitude = float(items[0])
                strain.collect.location.longitude = float(items[1])
                if len(items) > 2:
                    strain.collect.location.coord_uncertainty = items[2]
            elif attribute == "collect.location":
                location = locations[value]
                if 'Country' in location and location['Country']:
                    if location['Country'] == 'Unknown':
                        continue
                    country_3 = _get_country_alpha3(location['Country'])
                    strain.collect.location.country = country_3
                strain.collect.location.state = location["Region"]
                strain.collect.location.municipality = location["City"]
                strain.collect.location.site = location["Locality"]
            elif attribute in ("abs_related_files", "mta_files"):
                rsetattr(strain, attribute, value.split(";"))
            elif attribute in ("is_from_registered_collection",
                              "is_subject_to_quarantine", 'taxonomy.interspecific_hybrid',
                              "is_potentially_harmful", "genetics.gmo"):
                rsetattr(strain, attribute, TRUEFALSE_TRANSLATOR[value])
            elif attribute == "publications":
                value = str(value)
                pubs = []
                pub_ids = [v.strip() for v in str(value).split(";")]
                for pub_id in pub_ids:
                    pub = publications.get(pub_id, None)
                    if pub is None:
                        # Unknown id: treat it as a raw DOI or PubMed id.
                        pub = Publication()
                        if '/' in pub_id:
                            pub.doi = pub_id
                        else:
                            pub.pubmed_id = pub_id
                    pubs.append(pub)
                rsetattr(strain, attribute, pubs)
            elif attribute == 'ontobiotope':
                values = []
                for val in value.split(';'):
                    if val not in ontobiotopes_by_id:
                        # Not an id: translate the ontobiotope name to its id.
                        val = ontobiotopes_by_name[val]
                    values.append(val)
                # Bug fix: the original stored the raw cell string here,
                # discarding the normalized id list it had just built.
                rsetattr(strain, attribute, values)
            elif attribute == 'other_denominations':
                value = [v.strip() for v in value.split(';')]
                rsetattr(strain, attribute, value)
            elif attribute == 'genetics.plasmids':
                value = [v.strip() for v in value.split(';')]
                rsetattr(strain, attribute, value)
            else:
                rsetattr(strain, attribute, value)
        # add markers
        strain_id = strain.id.strain_id
        if strain_id in markers:
            for marker in markers[strain_id]:
                _marker = GenomicSequenceBiolomics()
                _marker.marker_id = marker["INSDC AN"]
                _marker.marker_type = marker["Marker"]
                _marker.marker_seq = marker["Sequence"]
                strain.genetics.markers.append(_marker)
        yield strain
def _get_country_alpha3(loc_country):
    """Return the ISO alpha-3 code for a country name or alpha-3 code.

    'INW' (international waters) is passed through unchanged. Lookup order:
    by name, then by current alpha-3 code, then by historic alpha-3 code.

    Raises ValueError when the country cannot be resolved.
    """
    if loc_country == 'INW':
        return loc_country
    country = get_country_from_name(loc_country)
    if not country:
        country = pycountry.countries.get(alpha_3=loc_country)
    if not country:
        country = pycountry.historic_countries.get(alpha_3=loc_country)
    if not country:
        # Bug fix: the original fell through and crashed with
        # AttributeError on None; raise a clear error instead.
        raise ValueError(f'Could not resolve country: {loc_country}')
    return country.alpha_3

View File

View File

@ -0,0 +1,305 @@
import csv
from copy import deepcopy
from openpyxl.workbook.workbook import Workbook
from mirri import rgetattr
from mirri.settings import GROWTH_MEDIA, MIRRI_FIELDS, DATA_DIR, PUBLICATION_FIELDS
from mirri.io.parsers.mirri_excel import NAGOYA_TRANSLATOR, RESTRICTION_USE_TRANSLATOR
# Sexual states pre-seeded into the "Sexual states" sheet; strain-specific
# states found while writing are added on top of these.
# NOTE(review): "Mata", "MT+" and "MT-" appear twice — harmless because the
# writer converts this list to a set, but it looks unintended.
INITIAL_SEXUAL_STATES = [
    "Mata",
    "Matalpha",
    "Mata/Matalpha",
    "Mata",
    "Matb",
    "Mata/Matb",
    "MTLa",
    "MTLalpha",
    "MTLa/MTLalpha",
    "MAT1-1",
    "MAT1-2",
    "MAT1",
    "MAT2",
    "MT+",
    "MT-",
    "MT+",
    "MT-",
    "H+",
    "H-",
]
# Header definition for the "Markers" sheet.
MARKER_FIELDS = [
    {"attribute": "acronym", "label": "Acronym", "mandatory": True},
    {"attribute": "marker", "label": "Marker", "mandatory": True},
]
# Marker acronyms and names allowed by the MIRRI specification.
MARKER_DATA = [
    {"acronym": "16S rRNA", "marker": "16S rRNA"},
    {"acronym": "ACT", "marker": "Actin"},
    {"acronym": "CaM", "marker": "Calmodulin"},
    {"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"},
    {"acronym": "ITS", "marker": "nuclear ribosomal Internal Transcribed Spacer (ITS)"},
    {"acronym": "LSU", "marker": "nuclear ribosomal Large SubUnit (LSU)"},
    {"acronym": "RPB1", "marker": "Ribosomal RNA-coding genes RPB1"},
    {"acronym": "RPB2", "marker": "Ribosomal RNA-coding genes RPB2"},
    {"acronym": "TUBB", "marker": "β-Tubulin"},
]
# Reverse maps: internal constants back to the numeric excel codes.
REV_RESTRICTION_USE_TRANSLATOR = {v: k for k, v in RESTRICTION_USE_TRANSLATOR.items()}
REV_NAGOYA_TRANSLATOR = {v: k for k, v in NAGOYA_TRANSLATOR.items()}
# Column headers of the "Literature" sheet.
PUB_HEADERS = [pb["label"] for pb in PUBLICATION_FIELDS]
def write_mirri_excel(path, strains, growth_media, version):
    """Write *strains* and *growth_media* to *path* in the given layout.

    NOTE(review): an unsupported *version* is silently ignored (nothing is
    written), matching the original behaviour — confirm that is intended.
    """
    if version != "20200601":
        return
    _write_mirri_excel_20200601(path, strains, growth_media)
def _write_mirri_excel_20200601(path, strains, growth_media):
    """Write strains plus companion sheets in the 20200601 excel layout.

    The strain rows are generated first because walking them also collects
    the indexed data (locations, publications, sexual states and genomic
    markers) that the other sheets are built from.
    """
    wb = Workbook()
    write_markers_sheet(wb)
    ontobiotope_path = DATA_DIR / "ontobiotopes.csv"
    write_ontobiotopes(wb, ontobiotope_path)
    write_growth_media(wb, growth_media)
    growth_media_indexes = [str(gm.acronym) for gm in growth_media]
    # Containers filled as a side effect of _deserialize_strains.
    locations = {}
    publications = {}
    sexual_states = set(deepcopy(INITIAL_SEXUAL_STATES))
    genomic_markers = {}
    strains_data = _deserialize_strains(strains, locations, growth_media_indexes,
                                        publications, sexual_states, genomic_markers)
    strains_data = list(strains_data)
    # write strain to generate indexed data
    strain_sheet = wb.create_sheet("Strains")
    strain_sheet.append([field["label"] for field in MIRRI_FIELDS])
    for strain_row in strains_data:
        strain_sheet.append(strain_row)
    redimension_cell_width(strain_sheet)
    # write locations
    loc_sheet = wb.create_sheet("Geographic origin")
    loc_sheet.append(["ID", "Country", "Region", "City", "Locality"])
    for index, loc_index in enumerate(locations.keys()):
        location = locations[loc_index]
        row = [index, location.country, location.state, location.municipality,
               loc_index]
        loc_sheet.append(row)
    redimension_cell_width(loc_sheet)
    # write publications
    pub_sheet = wb.create_sheet("Literature")
    pub_sheet.append(PUB_HEADERS)
    for publication in publications.values():
        row = []
        for pub_field in PUBLICATION_FIELDS:
            # if pub_field['attribute'] == 'id':
            #     value = index
            value = getattr(publication, pub_field['attribute'], None)
            row.append(value)
        pub_sheet.append(row)
    redimension_cell_width(pub_sheet)
    # write sexual states
    sex_sheet = wb.create_sheet("Sexual states")
    for sex_state in sorted(list(sexual_states)):
        sex_sheet.append([sex_state])
    redimension_cell_width(sex_sheet)
    # write genetic markers
    markers_sheet = wb.create_sheet("Genomic information")
    markers_sheet.append(['Strain AN', 'Marker', 'INSDC AN', 'Sequence'])
    for strain_id, markers in genomic_markers.items():
        for marker in markers:
            row = [strain_id, marker.marker_type, marker.marker_id, marker.marker_seq]
            markers_sheet.append(row)
    redimension_cell_width(markers_sheet)
    # Drop the default empty sheet that Workbook() creates.
    del wb["Sheet"]
    wb.save(str(path))
def _deserialize_strains(strains, locations, growth_media_indexes,
                         publications, sexual_states, genomic_markers):
    """Yield one excel row (list of cell values) per strain.

    Side effects: fills *locations*, *publications*, *sexual_states* and
    *genomic_markers* with the auxiliary data found while walking the
    strains, so the caller can write the companion sheets afterwards.

    Raises ValueError when a strain references a growth medium that is not
    among *growth_media_indexes*.
    """
    for strain in strains:
        strain_row = []
        for field in MIRRI_FIELDS:
            attribute = field["attribute"]
            if attribute == "id":
                value = strain.id.strain_id
            elif attribute == "restriction_on_use":
                value = rgetattr(strain, attribute)
                if value is not None:
                    value = REV_RESTRICTION_USE_TRANSLATOR[value]
            elif attribute == "nagoya_protocol":
                value = rgetattr(strain, attribute)
                if value:
                    value = REV_NAGOYA_TRANSLATOR[value]
            elif attribute == "other_numbers":
                value = rgetattr(strain, attribute)
                if value is not None:
                    value = [f"{on.collection} {on.number}" for on in value]
                    value = "; ".join(value)
            elif attribute == 'other_denominations':
                od = strain.other_denominations
                value = "; ".join(od) if od else None
            elif attribute in (
                "is_from_registered_collection",
                "is_subject_to_quarantine",
                "is_potentially_harmful",
                "genetics.gmo",
                "taxonomy.interspecific_hybrid"
            ):
                # Excel boolean convention: 2 = yes, 1 = no.
                value = rgetattr(strain, attribute)
                if value is True:
                    value = 2
                elif value is False:
                    value = 1
                else:
                    value = None
            elif attribute == "taxonomy.taxon_name":
                value = strain.taxonomy.long_name
            elif attribute in ("deposit.date", "collect.date", "isolation.date",
                              'catalog_inclusion_date'):
                value = rgetattr(strain, attribute)
                value = value.strfdate if value else None
            elif attribute == "growth.recommended_media":
                value = rgetattr(strain, attribute)
                if value is not None:
                    for gm in value:
                        gm = str(gm)
                        if gm not in growth_media_indexes:
                            # Bug fix: the original printed a debug line and
                            # `continue`d past an unreachable raise, silently
                            # accepting unknown media.
                            msg = f"Growth media {gm} not in the provided ones"
                            raise ValueError(msg)
                    value = "/".join(str(gm) for gm in value)
            elif attribute in ('growth.tested_temp_range',
                              "growth.recommended_temp"):
                value = rgetattr(strain, attribute)
                if value:
                    value = f'{value["min"]}; {value["max"]}'
            elif attribute == "form_of_supply":
                value = rgetattr(strain, attribute)
                value = ";".join(value)
            elif attribute == "collect.location.coords":
                lat = strain.collect.location.latitude
                long = strain.collect.location.longitude
                if lat is not None and long is not None:
                    value = f"{lat};{long}"
                else:
                    value = None
            elif attribute == "collect.location":
                location = strain.collect.location
                loc_index = _build_location_index(location)
                if loc_index is None:
                    continue
                if loc_index not in locations:
                    locations[loc_index] = location
                value = loc_index
            elif attribute in ("abs_related_files", "mta_files"):
                value = rgetattr(strain, attribute)
                value = ";".join(value) if value else None
            elif attribute == "taxonomy.organism_type":
                # NOTE: the original had a second, unreachable duplicate of
                # this branch further down; it was removed.
                value = rgetattr(strain, attribute)
                if value:
                    value = "; ".join([str(v.code) for v in value])
            elif attribute == "history":
                value = rgetattr(strain, attribute)
                if value is not None:
                    value = " < ".join(value)
            elif attribute == "genetics.sexual_state":
                value = rgetattr(strain, attribute)
                if value:
                    sexual_states.add(value)
            elif attribute == "genetics.ploidy":
                value = rgetattr(strain, attribute)
            elif attribute == 'publications':
                value = []
                for pub in strain.publications:
                    value.append(pub.id)
                    # NOTE(review): publications without an id all share the
                    # None key here — confirm ids are always assigned.
                    if pub.id not in publications:
                        publications[pub.id] = pub
                value = ';'.join(str(v) for v in value) if value else None
            elif attribute == 'genetics.plasmids':
                value = rgetattr(strain, attribute)
                if value is not None:
                    value = ';'.join(value)
            else:
                value = rgetattr(strain, attribute)
            strain_row.append(value)
        genomic_markers[strain.id.strain_id] = strain.genetics.markers
        yield strain_row
def _build_location_index(location):
index = []
if location.country:
index.append(location.country)
if location.site:
index.append(location.site)
return ';'.join(index) if index else None
def write_markers_sheet(wb):
    """Create the 'Markers' sheet and fill it with the marker data.

    Column labels and attribute order come from MARKER_FIELDS; the rows
    come from MARKER_DATA.
    """
    markers_sheet = wb.create_sheet("Markers")
    labels = [field["label"] for field in MARKER_FIELDS]
    attributes = [field["attribute"] for field in MARKER_FIELDS]
    _write_work_sheet(markers_sheet, labels=labels, attributes=attributes,
                      data=MARKER_DATA)
    redimension_cell_width(markers_sheet)
def write_ontobiotopes(workbook, ontobiotype_path):
    """Copy the ontobiotope TSV file into an 'Ontobiotope' sheet.

    ontobiotype_path is a pathlib.Path to a tab-separated file; each TSV
    row becomes one worksheet row.
    """
    sheet = workbook.create_sheet("Ontobiotope")
    with ontobiotype_path.open() as tsv_fhand:
        for line in csv.reader(tsv_fhand, delimiter="\t"):
            sheet.append(line)
    redimension_cell_width(sheet)
def _write_work_sheet(sheet, labels, attributes, data):
    """Write a header row of *labels* followed by one row per item in *data*.

    Each row picks the values of *attributes* (in order) from the data item;
    column widths are then adjusted to fit the content.
    """
    sheet.append(labels)
    for item in data:
        sheet.append([item[attribute] for attribute in attributes])
    redimension_cell_width(sheet)
def write_growth_media(wb, growth_media):
    """Create the growth-media sheet with one row per growth medium."""
    sheet = wb.create_sheet(GROWTH_MEDIA)
    sheet.append(["Acronym", "Description", "Full description"])
    for medium in growth_media:
        sheet.append([medium.acronym,
                      medium.description,
                      medium.full_description])
    redimension_cell_width(sheet)
def redimension_cell_width(ws):
    """Resize every column of worksheet *ws* to fit its widest cell.

    Scans all populated cells, records the longest string representation
    per column letter, and sets that length as the column width.
    """
    max_widths = {}
    for row in ws.rows:
        for cell in row:
            # Compare against None (not truthiness) so falsy values such as
            # 0 or False still contribute to the column width; previously a
            # column containing only zeros kept the default width.
            if cell.value is not None:
                width = len(str(cell.value))
                if width > max_widths.get(cell.column_letter, 0):
                    max_widths[cell.column_letter] = width
    for column_letter, width in max_widths.items():
        ws.column_dimensions[column_letter].width = width

296
mirri/settings.py Normal file
View File

@ -0,0 +1,296 @@
from pathlib import Path
# Directory with data files bundled alongside the package.
DATA_DIR = Path(__file__).parent / "data"
# Canonical field names of the MIRRI strain specification; these are the
# keys used when parsing/serializing strain records.
ACCESSION_NUMBER = "accession_number"
RESTRICTION_ON_USE = "restriction_on_use"
NAGOYA_PROTOCOL = "nagoya_protocol"
ABS_RELATED_FILES = "abs_related_files"
# NOTE(review): value is singular ("mta_file") while the constant name is
# plural -- presumably matches the spec's column key; confirm.
MTA_FILES = "mta_file"
OTHER_CULTURE_NUMBERS = "other_culture_collection_numbers"
STRAIN_FROM_REGISTERED_COLLECTION = "strain_from_a_registered_collection"
RISK_GROUP = "risk_group"
DUAL_USE = "dual_use"
QUARANTINE = "quarantine"
ORGANISM_TYPE = "organism_type"
TAXON_NAME = "taxon_name"
INFRASUBSPECIFIC_NAME = "infrasubspecific_names"
COMMENTS_ON_TAXONOMY = "comments_on_taxonomy"
STATUS = "status"
HISTORY_OF_DEPOSIT = "history_of_deposit"
DEPOSITOR = "depositor"
DATE_OF_DEPOSIT = "date_of_deposit"
COLLECTED_BY = "collected_by"
DATE_OF_COLLECTION = "date_of_collection"
ISOLATED_BY = "isolated_by"
DATE_OF_ISOLATION = "date_of_isolation"
DATE_OF_INCLUSION = "date_of_inclusion_on_catalog"
TESTED_TEMPERATURE_GROWTH_RANGE = "tested_temperature_growth_range"
RECOMMENDED_GROWTH_TEMP = "recommended_growth_temperature"
RECOMMENDED_GROWTH_MEDIUM = "recommended_media_for_growth"
FORM_OF_SUPPLY = "form_of_supply"
GEO_COORDS = "coordinates_of_geographic_origin"
ACCESSION_NAME = "other_denomination"
ALTITUDE = "altitude_of_geographic_origin"
GEOGRAPHIC_ORIGIN = "geographic_origin"
GMO = "gmo"
GMO_CONSTRUCTION_INFO = "gmo_construction_information"
MUTANT_INFORMATION = "mutant_information"
GENOTYPE = "genotype"
LITERATURE = "literature"
SEXUAL_STATE = "sexual_state"
PLOIDY = "ploidy"
INTERSPECIFIC_HYBRID = "interspecific_hybrid"
HYBRIDS = 'hybrids'
PLANT_PATHOGENICITY_CODE = "plant_pathogenicity_code"
PATHOGENICITY = "pathogenicity"
ENZYME_PRODUCTION = "enzyme_production"
PRODUCTION_OF_METABOLITES = "production_of_metabolites"
APPLICATIONS = "applications"
REMARKS = "remarks"
PLASMIDS = "plasmids"
PLASMIDS_COLLECTION_FIELDS = "plasmids_collections_fields"
SUBSTRATE_HOST_OF_ISOLATION = "substrate_host_of_isolation"
ISOLATION_HABITAT = "isolation_habitat"
ONTOBIOTOPE_ISOLATION_HABITAT = "ontobiotope_term_for_the_isolation_habitat"
LITERATURE_LINKED_TO_SEQ_GENOME = "literature_linked_to_the_sequence_genome"
# StrainId
STRAIN_ID = "id"
COLLECTION_CODE = "collection_code"
STRAIN_PUI = "strain_pui"
STRAIN_URL = "strain_url"
ID_SYNONYMS = 'id_synonyms'
# Taxonomy
GENUS = "genus"
SPECIES = "species"
# Location
# Keys of a collecting-site record. The camelCase values presumably mirror
# an external schema -- do not "normalize" them.
COUNTRY = "countryOfOriginCode"
SITE = "site"
STATE = "state"
PROVINCE = "province"
MUNICIPALITY = "municipality"
ISLAND = "island"
OTHER = "other"
LATITUDE = "latitude"
LONGITUDE = "longitude"
# NOTE(review): ALTITUDE is rebound here -- it was defined above as
# "altitude_of_geographic_origin"; after import only "altitude" survives.
# Confirm no consumer relies on the first value.
ALTITUDE = "altitude"
GEOREF_METHOD = "georeferencingMethod"
COORDUNCERTAINTY = "coordUncertainty"
# NOTE(review): "coordenates" looks like a typo, but it is a serialized key;
# renaming could break existing data -- confirm before changing.
COORD_SPATIAL_REFERENCE = "coordenatesSpatialReference"
LOCATION = "location"
# Keys accepted in a collecting-site (geographic origin) mapping.
ALLOWED_COLLECTING_SITE_KEYS = [
    COUNTRY,
    STATE,
    PROVINCE,
    ISLAND,
    MUNICIPALITY,
    OTHER,
    SITE,
    LATITUDE,
    LONGITUDE,
    ALTITUDE,
    GEOREF_METHOD,
    COORDUNCERTAINTY,
    COORD_SPATIAL_REFERENCE,
]
# Column layout of the "Strains" sheet: the header label of each column and
# the (dotted-path) strain attribute that fills it, in spec order.
MIRRI_FIELDS = [
    {"attribute": "id", "label": "Accession number"},
    {"attribute": "restriction_on_use", "label": "Restrictions on use"},
    {"attribute": "nagoya_protocol",
     "label": "Nagoya protocol restrictions and compliance conditions"},
    {"attribute": ABS_RELATED_FILES, "label": "ABS related files"},
    {"attribute": "mta_files", "label": "MTA file"},
    {"attribute": "other_numbers", "label": "Other culture collection numbers"},
    {"attribute": "is_from_registered_collection",
     "label": "Strain from a registered collection"},
    {"attribute": "risk_group", "label": "Risk Group"},
    {"attribute": "is_potentially_harmful", "label": "Dual use"},
    {"attribute": "is_subject_to_quarantine", "label": "Quarantine in Europe"},
    {"attribute": "taxonomy.organism_type", "label": "Organism type"},
    {"attribute": "taxonomy.taxon_name", "label": "Taxon name"},
    {"attribute": "taxonomy.infrasubspecific_name",
     "label": "Infrasubspecific names"},
    {"attribute": "taxonomy.comments", "label": "Comment on taxonomy"},
    {"attribute": "taxonomy.interspecific_hybrid",
     "label": "Interspecific hybrid"},
    {"attribute": "status", "label": "Status"},
    {"attribute": "history", "label": "History of deposit", },
    {"attribute": "deposit.who", "label": "Depositor"},
    {"attribute": "deposit.date", "label": "Date of deposit"},
    {"attribute": "catalog_inclusion_date",
     "label": "Date of inclusion in the catalogue"},
    {"attribute": "collect.who", "label": "Collected by"},
    {"attribute": "collect.date", "label": "Date of collection"},
    {"attribute": "isolation.who", "label": "Isolated by"},
    {"attribute": "isolation.date", "label": "Date of isolation"},
    {"attribute": "isolation.substrate_host_of_isolation",
     "label": "Substrate/host of isolation"},
    {"attribute": "growth.tested_temp_range",
     "label": "Tested temperature growth range"},
    {"attribute": "growth.recommended_temp",
     "label": "Recommended growth temperature"},
    {"attribute": "growth.recommended_media",
     "label": "Recommended medium for growth"},
    {"attribute": "form_of_supply", "label": "Form of supply"},
    {"attribute": "other_denominations", "label": "Other denomination"},
    {"attribute": "collect.location.coords",
     "label": "Coordinates of geographic origin"},
    {"attribute": "collect.location.altitude",
     "label": "Altitude of geographic origin"},
    {"attribute": "collect.location", "label": "Geographic origin"},
    {"attribute": "collect.habitat", "label": "Isolation habitat"},
    {"attribute": "collect.habitat_ontobiotope",
     "label": "Ontobiotope term for the isolation habitat"},
    {"attribute": "genetics.gmo", "label": "GMO"},
    {"attribute": "genetics.gmo_construction",
     "label": "GMO construction information"},
    {"attribute": "genetics.mutant_info", "label": "Mutant information"},
    {"attribute": "genetics.genotype", "label": "Genotype"},
    {"attribute": "genetics.sexual_state", "label": "Sexual state"},
    {"attribute": "genetics.ploidy", "label": "Ploidy"},
    {"attribute": "genetics.plasmids", "label": "Plasmids"},
    {"attribute": "genetics.plasmids_in_collections",
     "label": "Plasmids collections fields"},
    {"attribute": "publications", "label": "Literature"},
    {"attribute": PLANT_PATHOGENICITY_CODE, "label": "Plant pathogenicity code"},
    {"attribute": "pathogenicity", "label": "Pathogenicity"},
    {"attribute": "enzyme_production", "label": "Enzyme production"},
    {"attribute": "production_of_metabolites",
     "label": "Production of metabolites"},
    {"attribute": "applications", "label": "Applications", },
    {"attribute": "remarks", "label": "Remarks"},
    {"attribute": LITERATURE_LINKED_TO_SEQ_GENOME,
     "label": "Literature linked to the sequence/genome"},
]
# Subtaxon ranks accepted below species level.
ALLOWED_SUBTAXA = ["subspecies", "variety", "convarietas", "group", "forma",
                   'forma.specialis']
ALLOWED_TAXONOMIC_RANKS = ["family", "genus", "species"] + ALLOWED_SUBTAXA
# nagoya
# Allowed values of the Nagoya-protocol field (spaces encoded as "_").
NAGOYA_NO_RESTRICTIONS = "no_known_restrictions_under_the_Nagoya_protocol"
NAGOYA_DOCS_AVAILABLE = "documents_providing_proof_of_legal_access_and_terms_of_use_available_at_the_collection"
NAGOYA_PROBABLY_SCOPE = "strain_probably_in_scope,_please_contact_the_culture_collection"
ALLOWED_NAGOYA_OPTIONS = [NAGOYA_NO_RESTRICTIONS,
                          NAGOYA_DOCS_AVAILABLE, NAGOYA_PROBABLY_SCOPE]
# Use restriction
NO_RESTRICTION = "no_restriction"
ONLY_RESEARCH = "only_research"
COMMERCIAL_USE_WITH_AGREEMENT = "commercial_use_with_agreement"
ALLOWED_RESTRICTION_USE_OPTIONS = [
    NO_RESTRICTION,
    ONLY_RESEARCH,
    COMMERCIAL_USE_WITH_AGREEMENT,
]
# Risk groups are stored as strings, not ints.
ALLOWED_RISK_GROUPS = ["1", "2", "3", "4"]
# Forms of supply of a strain.
AGAR = "Agar"
CRYO = "Cryo"
DRY_ICE = "Dry Ice"
LIQUID_CULTURE_MEDIUM = "Liquid Culture Medium"
LYO = "Lyo"
OIL = "Oil"
WATER = "Water"
ALLOWED_FORMS_OF_SUPPLY = [AGAR, CRYO, DRY_ICE,
                           LIQUID_CULTURE_MEDIUM, LYO, OIL, WATER]
# Names of the nested strain sub-records.
DEPOSIT = "deposit"
ISOLATION = "isolation"
COLLECT = "collect"
GROWTH = "growth"
GENETICS = "genetics"
TAXONOMY = "taxonomy"
# Markers
# NOTE(review): MARKERS is redefined near the bottom of this module with the
# sheet name "Markers"; this lowercase value is shadowed after import --
# confirm which one consumers expect.
MARKERS = "markers"
MARKER_TYPE = "marker_type"
MARKER_INSDC = "INSDC"
MARKER_SEQ = "marker_seq"
# Genomic marker types accepted in the "Markers" sheet.
ALLOWED_MARKER_TYPES = [
    {"acronym": "16S rRNA", "marker": "16S rRNA"},
    {"acronym": "ACT", "marker": "Actin"},
    {"acronym": "CaM", "marker": "Calmodulin"},
    {"acronym": "EF-1α", "marker": "elongation factor 1-alpha (EF-1α)"},
    {"acronym": "ITS",
     "marker": "nuclear ribosomal Internal Transcribed Spacer (ITS)"},
    {"acronym": "LSU", "marker": "nuclear ribosomal Large SubUnit (LSU)"},
    {"acronym": "RPB1", "marker": "Ribosomal RNA-coding genes RPB1"},
    {"acronym": "RPB2", "marker": "Ribosomal RNA-coding genes RPB2"},
    {"acronym": "TUBB", "marker": "β-Tubulin"},
]
# Field names of a publication record.
PUBLICATIONS = "publications"
PUB_ID = "id"
PUB_DOI = "pub_doi"
# NOTE(review): empty string looks like an unfinished placeholder -- probably
# intended to be something like "pubmed_id"; confirm before relying on it.
PUB_PUBMED_ID = ''
PUB_FULL_REFERENCE = "full_reference"
PUB_TITLE = "title"
PUB_AUTHORS = "authors"
PUB_JOURNAL = "journal"
PUB_YEAR = "year"
PUB_VOLUME = "volume"
PUB_ISSUE = "issue"
PUB_FIRST_PAGE = "first_page"
PUB_LAST_PAGE = "last_page"
BOOK_TITLE = "book_title"
BOOK_EDITOR = "book_editor"
BOOK_PUBLISHER = "book_publisher"
# Column layout of the "Literature" sheet: header label and the publication
# attribute that provides each column's value.
PUBLICATION_FIELDS = [
    {"label": "ID", "attribute": PUB_ID},
    {"label": "Full reference", "attribute": PUB_FULL_REFERENCE},
    {"label": "Authors", "attribute": PUB_AUTHORS},
    {"label": "Title", "attribute": PUB_TITLE},
    {"label": "Journal", "attribute": PUB_JOURNAL},
    {"label": "Year", "attribute": PUB_YEAR},
    {"label": "Volume", "attribute": PUB_VOLUME},
    {"label": "Issue", "attribute": PUB_ISSUE},
    {"label": "First page", "attribute": PUB_FIRST_PAGE},
    # Fixed: this row previously reused PUB_FIRST_PAGE, so the "Last page"
    # column was silently filled with first-page values.
    {"label": "Last page", "attribute": PUB_LAST_PAGE},
    {"label": "Book title", "attribute": BOOK_TITLE},
    {"label": "Editors", "attribute": BOOK_EDITOR},
    {"label": "Publisher", "attribute": BOOK_PUBLISHER},
]
# ploidy
# Numeric ploidy codes from the MIRRI specification.
ANEUPLOID = 0
HAPLOID = 1
DIPLOID = 2
TRIPLOID = 3
TETRAPLOID = 4
POLYPLOID = 9
ALLOWED_PLOIDIES = [ANEUPLOID, HAPLOID, DIPLOID, TRIPLOID, TETRAPLOID,
                    POLYPLOID]
# Abbreviations used inside taxon names mapped to the subtaxon rank they
# denote (see ALLOWED_SUBTAXA).
SUBTAXAS = {
    "subsp.": "subspecies",
    "var.": "variety",
    "convar.": "convarietas",
    "group.": "group",
    "f.": "forma",
    "f.sp.": "forma.specialis"
}
# Excel sheet name
LOCATIONS = "Geographic origin"  # 'Locations'
GROWTH_MEDIA = "Growth media"
GENOMIC_INFO = "Genomic information"
STRAINS = "Strains"
LITERATURE_SHEET = "Literature"
SEXUAL_STATE_SHEET = "Sexual states"
RESOURCE_TYPES_VALUES = "Resource types values"
FORM_OF_SUPPLY_SHEET = "Forms of supply"
PLOIDY_SHEET = "Ploidy"
ONTOBIOTOPE = "Ontobiotope"
# NOTE(review): this rebinds MARKERS (defined earlier in this module as
# "markers") to the sheet name -- after import only "Markers" is visible;
# confirm the rebinding is intended.
MARKERS = "Markers"

48
mirri/utils.py Normal file
View File

@ -0,0 +1,48 @@
import pycountry
class FakeCountry:
    """Stand-in for a pycountry record, for places pycountry does not know.

    Exposes the same ``name``/``code3`` attributes callers read from real
    country records.
    """

    def __init__(self, name=None, code3=None):
        self.name = name
        self.code3 = code3
def get_pycountry(value):
    """Resolve *value* (country name or alpha-3 code) to a country record.

    Name lookup is tried first, then the alpha-3 code lookup; returns None
    when neither matches.
    """
    if value == 'INW':
        # International waters have no ISO entry; return a fake record.
        return FakeCountry(name='International Water', code3='INW')
    by_name = get_country_from_name(value)
    if by_name is not None:
        return by_name
    return get_country_from_alpha3(value)
def get_country_from_name(name):
    """Look *name* up in pycountry's current and historic country lists.

    Tries, in order: exact name, common name, official name -- first in the
    current database, then in the historic one. Returns None when unknown.
    """
    try:
        # The first lookup lives inside the try block too: depending on the
        # installed pycountry version, .get() may raise KeyError for unknown
        # names instead of returning None; previously that error escaped.
        country = pycountry.countries.get(name=name)
        if country is None:
            country = pycountry.countries.get(common_name=name)
        if country is None:
            country = pycountry.countries.get(official_name=name)
        if country is None:
            country = pycountry.historic_countries.get(name=name)
        if country is None:
            country = pycountry.historic_countries.get(common_name=name)
        if country is None:
            country = pycountry.historic_countries.get(official_name=name)
    except (AttributeError, KeyError):
        country = None
    return country
def get_country_from_alpha3(code):
    """Look up a country by ISO 3166-1 alpha-3 *code*.

    Searches the current countries database first, then the historic one.
    Returns None when the code is unknown.
    """
    try:
        # First lookup moved inside the try block: some pycountry versions
        # raise KeyError for unknown codes instead of returning None, and
        # that error previously escaped this function.
        country = pycountry.countries.get(alpha_3=code)
        if country is None:
            country = pycountry.historic_countries.get(alpha_3=code)
    except (AttributeError, KeyError):
        country = None
    return country

View File

View File

@ -0,0 +1,50 @@
from mirri import rgetattr
def validate_strain(strain, version='20200601'):
    """Validate *strain* against the requested specification version.

    Only the '20200601' specification is supported; any other version
    raises NotImplementedError.
    """
    if version != '20200601':
        raise NotImplementedError('Only v20200601 is implemented')
    return _validate_strain_v20200601(strain)
def _validate_strain_v20200601(strain):
    """Validate a strain against spec version 20200601.

    Checks that every mandatory attribute is set and that the strain is
    Nagoya-protocol compliant. Returns a list of human readable error
    strings; empty when the strain is valid.
    """
    mandatory_attrs = [
        {'label': 'Accession Number', 'attr': 'id.strain_id'},
        {'label': 'Nagoya protocol', 'attr': 'nagoya_protocol'},
        {'label': 'Restriction on use', 'attr': 'restriction_on_use'},
        {'label': 'Risk group', 'attr': 'risk_group'},
        {'label': 'Organism type', 'attr': 'taxonomy.organism_type'},
        {'label': 'Taxon name', 'attr': 'taxonomy.long_name'},
        {'label': 'Recommended temperature to growth', 'attr': 'growth.recommended_temp'},
        {'label': 'Recommended media', 'attr': 'growth.recommended_media'},
        {'label': 'Form of supply', 'attr': 'form_of_supply'},
        {'label': 'Country', 'attr': 'collect.location.country'},
    ]
    errors = [f"{mandatory['label']} is mandatory field"
              for mandatory in mandatory_attrs
              if rgetattr(strain, mandatory['attr']) is None]
    if not is_valid_nagoya(strain):
        # Fixed typo in the user-facing message (was "wih").
        errors.append('Not compliant with nagoya protocol requirements')
    return errors
def is_valid_nagoya(strain):
    """Return False when the strain breaches the Nagoya-protocol check.

    Strains dated 2014 or later (the Nagoya protocol entered into force in
    October 2014) must declare a country of origin; anything earlier or
    undated passes.
    """
    # nagoya_requirements
    # Pick the first available date, in decreasing order of preference:
    # collection, isolation, deposit, catalogue inclusion.
    _date = strain.collect.date
    if _date is None:
        _date = strain.isolation.date
    if _date is None:
        _date = strain.deposit.date
    if _date is None:
        _date = strain.catalog_inclusion_date
    # NOTE(review): `_year` is a private attribute of the project's date
    # type -- presumably it exposes the year even for partial dates; confirm
    # it is the intended accessor (vs a public `.year`).
    year = None if _date is None else _date._year
    if year is not None and year >= 2014 and strain.collect.location.country is None:
        return False
    return True

View File

@ -0,0 +1,3 @@
from .error import Entity, Error
from .error_message import ErrorMessage
from .error_log import ErrorLog

View File

@ -0,0 +1,119 @@
from typing import Optional
from .error_message import ErrorMessage
class Entity():
    """One entity (sheet) of the MIRRI excel template.

    Args:
        acronym: three-character uppercase code identifying the entity
            (e.g. 'GMD' resolves to 'Growth Media').
    """

    def __init__(self, acronym: str) -> None:
        self.acronym = acronym

    def __str__(self) -> str:
        return f"Entity {self.acronym}: {self.name}"

    @property
    def _acronyms(self) -> list:
        # By convention every public, all-uppercase method of this class
        # resolves one acronym to its human readable name.
        acronyms = []
        for attr in dir(self):
            if not attr.isupper() or attr.startswith("__"):
                continue
            if callable(getattr(self, attr)):
                acronyms.append(attr)
        return acronyms

    @property
    def _names(self) -> dict:
        # Map every known acronym to the name its resolver method returns.
        return {acronym: getattr(self, acronym)() for acronym in self._acronyms}

    @property
    def name(self) -> str:
        try:
            return self._names[self.acronym]
        except KeyError:
            raise KeyError(f'Unknown acronym {self.acronym}.')

    @property
    def acronym(self) -> str:
        return self._acronym

    @acronym.setter
    def acronym(self, acronym: str) -> None:
        self._acronym = acronym

    def EFS(self) -> str:
        return 'Excel File Structure'

    def GMD(self) -> str:
        return 'Growth Media'

    def GOD(self) -> str:
        return 'Geographic Origin'

    def LID(self) -> str:
        return 'Literature'

    def STD(self) -> str:
        return 'Strains'

    def GID(self) -> str:
        return 'Genomic Information'

    def OTD(self) -> str:
        return 'Ontobiotope'

    def UCT(self) -> str:
        return 'Uncategorized'
class Error():
    """A single validation error.

    Fixed docstring: it previously documented nonexistent ``message`` and
    ``entity`` arguments instead of the actual constructor parameters.

    Args:
        code (str): error code (e.g. 'STD07'); stored upper-cased. Its first
            three characters identify the entity the error belongs to.
        pk (str, optional): primary key of the record (e.g. accession
            number) that triggered the error. Defaults to None.
        data (str, optional): offending value, interpolated into the error
            message and usable for sorting. Defaults to None.
    """
    def __init__(self, code: str, pk: Optional[str] = None, data: Optional[str] = None) -> None:
        # The code setter already upper-cases; no need to do it twice here.
        self.code = code
        self.pk = pk
        self.data = data

    def __str__(self):
        return f"Error {self._code}: {self.message}"

    @property
    def code(self) -> str:
        return self._code

    @code.setter
    def code(self, code: str) -> None:
        self._code = code.upper()

    @property
    def pk(self) -> Optional[str]:
        return self._pk

    @pk.setter
    def pk(self, pk: Optional[str] = None) -> None:
        self._pk = pk

    @property
    def data(self) -> Optional[str]:
        return self._data

    @data.setter
    def data(self, data: Optional[str]):
        self._data = data

    @property
    def entity(self) -> 'Entity':
        # The entity acronym is encoded in the first three characters of the
        # error code.
        return Entity(self.code[:3])

    @property
    def message(self) -> str:
        # Delegate the human readable text to the ErrorMessage catalogue.
        return ErrorMessage(self.code, self.pk, self.data).message

View File

@ -0,0 +1,77 @@
from typing import Optional, Union
from datetime import datetime
from .error import Error
class ErrorLog():
    def __init__(self, input_filename: str, cc: Optional[str] = None, date: Optional[Union[str, datetime]] = None, limit: int = 100):
        """
        Logger for Error instances.
        Args:
            input_filename (str): name of the file to be logged
            cc (str, optional): name of the curator. Defaults to None.
            date (str | datetime, optional): date (e.g. created, last modified) associated with the file, either a 'dd-mm-YYYY' string or a datetime. Useful for versioning. Defaults to None.
            limit (int, optional): limit of errors to print to the report. Defaults to 100.
        """
        self._input_filename = input_filename
        self._cc = cc
        # Fixed: assign through the property setter so that a 'dd-mm-YYYY'
        # string passed to the constructor is parsed into a datetime
        # (previously the raw string was stored, bypassing the setter).
        self.date = date
        # Errors grouped by entity acronym: {acronym: [Error, ...]}.
        self._errors = {}
        self.limit = limit
        # NOTE(review): _counter is never updated in this class -- presumably
        # meant to count logged errors; confirm it is still needed.
        self._counter = 0

    def __str__(self) -> str:
        output = f"""Error Log for file {self._input_filename}\nENTITY | CODE | MESSAGE"""
        for acronym, error_list in self.get_errors().items():
            for error in error_list:
                # Messages are truncated to 100 chars to keep rows readable.
                output += f"\n{acronym:6} | {error.code:6} | {error.message[:100]}"
        return output

    @property
    def input_filename(self) -> str:
        return self._input_filename

    @input_filename.setter
    def input_filename(self, input_filename: str) -> None:
        self._input_filename = input_filename

    @property
    def cc(self) -> Optional[str]:
        return self._cc

    @cc.setter
    def cc(self, cc: Optional[str]) -> None:
        self._cc = cc

    @property
    def date(self) -> Optional[Union[str, datetime]]:
        return self._date

    @date.setter
    def date(self, date: Optional[Union[str, datetime]] = None) -> None:
        # Strings are expected in 'dd-mm-YYYY' format; datetimes (and None)
        # are stored as-is.
        if isinstance(date, str):
            self._date = datetime.strptime(date, r'%d-%m-%Y')
        else:
            self._date = date

    def get_errors(self) -> dict:
        """
        Get all errors
        Returns:
            dict: Error instances grouped by entity acronym.
        """
        return self._errors

    def add_error(self, error: 'Error') -> None:
        """
        Add an error.
        Args:
            error (Error): Error instance.
        """
        # Group errors by the acronym of the entity they belong to.
        self._errors.setdefault(error.entity.acronym, []).append(error)

View File

@ -0,0 +1,408 @@
from typing import Optional
class ErrorMessage():
"""Error message
Args:
code (str): Error code.
pk (str | optional): The instance's primary key that triggered the error. Defaults to None.
value (str | optional): The instance's value that triggered the error. Defaults to None.
"""
def __init__(self, code: str, pk: Optional[str] = None, value: Optional[str] = None):
self.code = code.upper()
self.pk = pk
self.value = value
@property
def _codes(self) -> list:
return [
func
for func in dir(self)
if func.isupper() and
callable(getattr(self, func)) and
not func.startswith("__")
]
@property
def _messages(self) -> dict:
return {code: getattr(self, code) for code in self._codes}
@property
def message(self) -> str:
if not self._validate_code():
raise ValueError(f"{self.code} not found")
return self._messages[self.code]()
@property
def code(self) -> str:
return self._code
@code.setter
def code(self, code: str) -> None:
self._code = code.upper()
def _validate_code(self) -> bool:
return self.code in self._codes
@property
def pk(self) -> str:
return self._pk
@pk.setter
def pk(self, pk: str) -> None:
self._pk = pk
@property
def value(self) -> str:
return self._value
@value.setter
def value(self, value: str) -> None:
self._value = value
"""
Excel File Structure Error Codes
"""
def EXL00(self):
return f"The provided file '{self.pk}' is not an excel(xlsx) file"
def EFS01(self):
return "The 'Growth media' sheet is missing. Please check the provided excel template."
def EFS02(self):
return "The 'Geographic origin' sheet is missing. Please check the provided excel template."
def EFS03(self):
return "The 'Literature' sheet is missing. Please check the provided excel template."
def EFS04(self):
return "The 'Sexual state' sheet is missing. Please check the provided excel template."
def EFS05(self):
return "The 'Strains' sheet is missing. Please check the provided excel template."
def EFS06(self):
return "The 'Ontobiotope' sheet is missing. Please check the provided excel template."
def EFS07(self):
return "The 'Markers' sheet is missing. Please check the provided excel template."
def EFS08(self):
return "The 'Genomic information' sheet is missing. Please check the provided excel template."
"""
Growth Media Error Codes
"""
def GMD01(self):
return "The 'Acronym' column is a mandatory field in the Growth Media sheet."
def GMD02(self):
return "The 'Acronym' column is empty or has missing values."
def GMD03(self):
return "The 'Description' column is a mandatory field in the Growth Media sheet. The column can not be empty."
def GMD04(self):
return f"The 'Description' for growth media with Acronym {self.pk} is missing."
"""
Geographic Origin Error Codes
"""
def GOD01(self):
return "The 'ID' column is a mandatory field in the Geographic Origin sheet."
def GOD02(self):
return "The 'ID' column is empty or has missing values."
def GOD03(self):
return "The 'Country' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
def GOD04(self):
return f"The 'Country' for geographic origin with ID {self.pk} is missing."
def GOD05(self):
return f"The 'Country' for geographic origin with ID {self.pk} is incorrect."
def GOD06(self):
return f"The 'Locality' column is a mandatory field in the Geographic Origin sheet. The column can not be empty."
def GOD07(self):
return f"The 'Locality' for geographic origin with ID {self.pk} is missing."
"""
Literature Error Codes
"""
def LID01(self):
return "The 'ID' column is a mandatory field in the Literature sheet."
def LID02(self):
return "The 'ID' column empty or missing values."
def LID03(self):
return "The 'Full reference' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID04(self):
return f"The 'Full reference' for literature with ID {self.pk} is missing."
def LID05(self):
return "The 'Authors' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID06(self):
return f"The 'Authors' for literature with ID {self.pk} is missing."
def LID07(self):
return "The 'Title' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID08(self):
return f"The 'Title' for literature with ID {self.pk} is missing."
def LID09(self):
return "The 'Journal' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID10(self):
return f"The 'Journal' for literature with ID {self.pk} is missing."
def LID11(self):
return "The 'Year' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID12(self,):
return f"The 'Year' for literature with ID {self.pk} is missing."
def LID13(self):
return "The 'Volume' column is a mandatory field in the Literature sheet. The column can not be empty."
def LID14(self):
return f"The 'Volume' for literature with ID {self.pk} is missing."
def LID15(self):
return "The 'First page' column is a mandatory field. The column can not be empty."
def LID16(self):
return f"The 'First page' for literature with ID {self.pk} is missing."
def LID17(self):
msg = 'If journal; Title, Authors, journal, year and first page are required'
msg += 'If Book; Book Title, Authors, Year, Editors, Publishers'
return msg
"""
Strains Error Codes
"""
def STD01(self):
return "The 'Accession number' column is a mandatory field in the Strains sheet."
def STD02(self):
return "The 'Accession number' column is empty or has missing values."
def STD03(self):
return f"The 'Accesion number' must be unique. The '{self.value}' is repeated."
def STD04(self):
return (f"The 'Accession number' {self.pk} is not according to the specification."
" The value must be of the format '<Sequence of characters> <sequence of characters>'.")
def STD05(self):
return f"The 'Restriction on use' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD06(self):
return f"The 'Restriction on use' for strain with Accession Number {self.pk} is missing."
def STD07(self):
return (f"The 'Restriction on use' for strain with Accession Number {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD08(self):
return f"The 'Nagoya protocol restrictions and compliance conditions' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD09(self):
return f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is missing."
def STD10(self):
return (f"The 'Nagoya protocol restrictions and compliance conditions' for strain with Accession Number {self.pk} is not according to the specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD11(self):
return (f"The 'Strain from a registered collection' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3.")
def STD12(self):
return "The 'Risk group' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD13(self):
return f"The 'Risk group' for strain with Accession Number {self.pk} is missing."
def STD14(self):
return (f"The 'Risk group' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2, 3, 4.")
def STD15(self):
return (f"The 'Dual use' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD16(self):
return (f"The “Quarantine in europe” for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD17(self):
return f"The 'Organism type' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD18(self):
return f"The 'Organism type' for strain with Accession Number {self.pk} is missing."
def STD19(self):
return (f"The 'Organism type' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Algae', 'Archaea', 'Bacteria', 'Cyanobacteria', "
"'Filamentous Fungi', 'Phage', 'Plasmid', 'Virus', 'Yeast', 1, 2, 3, 4, 5, 6, 7, 8, 9.")
def STD20(self):
return f"The 'Taxon name' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD21(self):
return f"The 'Taxon name' for strain with Accession Number {self.pk} is missing."
def STD22(self):
return f"The 'Taxon name' for strain with Accession Number {self.pk} is incorrect."
def STD23(self):
return (f"The 'Interspecific hybrid' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2.")
def STD24(self):
return f"The 'History of deposit' for strain with Accession Number {self.pk} is incorrect."
def STD25(self):
return (f"The 'Date of deposit' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD26(self):
return (f"The 'Date of inclusion in the catalogue' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD27(self):
return (f"The 'Date of collection' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD28(self):
return (f"The 'Date of isolation' for strain with Accession Number {self.pk} is incorrect."
" The allowed formats are 'YYYY-MM-DD', 'YYYYMMDD', 'YYYYMM', and 'YYYY'.")
def STD29(self):
return (f"The 'Tested temperature growth range' for strain with Accession Number {self.pk} is incorrect."
" It must have two decimal numbers separated by ','")
def STD30(self):
return f"The 'Recommended growth temperature' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD31(self):
return f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is missing."
def STD32(self):
return (f"The 'Recommended growth temperature' for strain with Accession Number {self.pk} is incorrect."
" It must have two decimal numbers separated by ','.")
def STD33(self):
return f"The 'Recommended medium for growth' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD34(self):
return f"The 'Recommended medium for growth' for strain with Accession Number {self.pk} is missing."
def STD35(self):
return f"The value of 'Recommended medium for growth' for strain with Accession Number {self.pk} is not in the Growth Media Sheet."
def STD36(self):
return f"The 'Forms of supply' column is a mandatory field in the Strains Sheet. The column can not be empty."
def STD37(self):
return f"The 'Forms of supply' for strain with Accession Number {self.pk} is missing."
def STD38(self):
return f"The value of 'Forms of supply' for strain with Accession Number {self.pk} is not in the Forms of Supply Sheet."
def STD39(self):
return (f"The 'Coordinates of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
"The allowed formats are two or three decimal numbers separated by ','. Moreover, the first number must be"
"between [-90, 90], the second between [-180, 180], and the third, if provided, can assume any value.")
def STD40(self):
return (f"The 'Altitude of geographic origin' column for strain with Accession Number {self.pk} is incorrect."
"The allowed formats are one decimal number between [-200, 8000].")
def STD41(self):
return f"The value of 'Ontobiotope term for the isolation habitat' for strain with Accession Number {self.pk} is not in the Ontobiotope Sheet."
def STD42(self):
return (f"The 'GMO' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 1, 2")
def STD43(self):
return (f"The 'Sexual State' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 'Mata', 'Matalpha', 'Mata/Matalpha', "
"'Matb', 'Mata/Matb', 'MTLa', 'MTLalpha', 'MTLa/MTLalpha', 'MAT1-1', 'MAT1-2', 'MAT1', 'MAT2', 'MT+', 'MT-'")
def STD44(self):
return (f"The 'Ploidy' for strain with Accession Number {self.pk} is not according to specification."
f" Your value is {self.value} and the accepted values are 0, 1, 2, 3, 4, 9")
def STD45(self):
msg = f"At least one of the values '{self.value}' of the literature field for strain {self.pk} are not in the literature sheet. "
msg += "If the those values are Pubmed ids or DOIs, please ignore this messsage"
return msg
"""
Genomic Information Error Codes
"""
def GID01(self):
return f"The 'Strain Acession Number' (Strain AN) column is a mandatory field in the Genomic Information Sheet."
def GID02(self):
return f"The 'Strain Acession Number' (Strain AN) column is empty or has missing values."
def GID03(self):
return f"The value of 'Strain Acession Number' (Strain AN) {self.value} is not in the Strains sheet."
def GID04(self):
return f"The 'Marker' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
def GID05(self):
return f"The 'Marker' for genomic information with Strain AN {self.pk} is missing."
def GID06(self):
    """Error message: 'Marker' value not among the accepted markers."""
    strain_an = self.pk
    return f"The 'Marker' for genomic information with Strain AN {strain_an} is incorrect."
def GID07(self):
    """Error message: mandatory 'INSDC AN' column missing from the Genomic Information sheet."""
    # fix: dropped the f-prefix (the literal has no placeholders)
    return "The 'INSDC AN' column is a mandatory field in the Genomic Information Sheet. The column can not be empty."
def GID08(self):
    """Error message: 'INSDC AN' value missing for a genomic-information row."""
    strain_an = self.pk
    return f"The 'INSDC AN' for genomic information with Strain AN {strain_an} is missing."
def GID09(self):
    """Error message: malformed 'INSDC AN' value for a genomic-information row."""
    strain_an = self.pk
    return f"The 'INSDC AN' for genomic information with Strain AN {strain_an} is incorrect."
def GID10(self):
    """Error message: malformed 'Sequence' value for a genomic-information row."""
    # fix: "characteres" typo in the user-facing text
    return (f"The 'Sequence' for genomic information with Strain AN {self.pk} is incorrect."
            " It must be a sequence of 'G', 'T', 'A', 'C' characters of any length and without white spaces.")
"""
Ontobiotope Error Codes
"""
def OTD01(self):
    """Error message: mandatory 'ID' column missing from the Ontobiotope sheet."""
    # fix: "columns is" -> "column is"
    return "The 'ID' column is a mandatory field in the Ontobiotope Sheet."
def OTD02(self):
    """Error message: Ontobiotope 'ID' column empty or with missing values."""
    # fix: "columns is" -> "column is"
    return "The 'ID' column is empty or has missing values."
def OTD03(self):
    """Error message: mandatory 'Name' column missing from the Ontobiotope sheet."""
    # fix: "columns is" -> "column is"
    return "The 'Name' column is a mandatory field in the Ontobiotope Sheet. The column can not be empty."
def OTD04(self):
    """Error message: 'Name' value missing for an Ontobiotope row."""
    term_id = self.pk
    return f"The 'Name' for ontobiotope with ID {term_id} is missing."

View File

@ -0,0 +1,483 @@
import re
from pathlib import Path
from io import BytesIO
from zipfile import BadZipfile
from datetime import datetime
from calendar import monthrange
from openpyxl import load_workbook
from mirri.io.parsers.excel import workbook_sheet_reader, get_all_cell_data_from_sheet
from mirri.validation.error_logging import ErrorLog, Error
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON,
TYPE, UNIQUE, VALIDATION, VALUES, BIBLIO)
from mirri.settings import LOCATIONS, SUBTAXAS
from mirri.validation.validation_conf_20200601 import MIRRI_20200601_VALLIDATION_CONF
def validate_mirri_excel(fhand, version="20200601"):
    """Validate a MIRRI excel file-like object and return an ErrorLog.

    Only the 20200601 specification is currently supported; any other
    *version* raises NotImplementedError.
    """
    if version == "20200601":
        configuration = MIRRI_20200601_VALLIDATION_CONF
    else:
        # fix: message read "version20200601" (missing space)
        raise NotImplementedError("Only version 20200601 is implemented")
    return validate_excel(fhand, configuration)
def validate_excel(fhand, configuration):
    """Validate an excel file-like object against *configuration*; return an ErrorLog.

    Validation runs in two phases: structural (workbook opens, mandatory
    sheets/columns present) and, only if the structure is sound, content
    (per-cell and per-row checks).
    """
    validation_conf = configuration['sheet_schema']
    cross_ref_conf = configuration['cross_ref_conf']
    in_memory_sheet_conf = configuration['keep_sheets_in_memory']
    excel_name = Path(fhand.name).stem
    error_log = ErrorLog(excel_name)
    try:
        # read_only/data_only: stream cached cell values, not formulas
        workbook = load_workbook(filename=BytesIO(
            fhand.read()), read_only=True, data_only=True)
    except (BadZipfile, IOError):
        # not a readable xlsx file at all
        error = Error('EXL00', fhand.name, fhand.name)
        error_log.add_error(error)
        return error_log
    # excel structure errors
    structure_errors = list(validate_excel_structure(workbook, validation_conf))
    if structure_errors:
        for error in structure_errors:
            error = Error(error[ERROR_CODE], pk=error['id'],
                          data=error['value'])
            error_log.add_error(error)
        # broken structure: content validation would be meaningless
        return error_log
    crossrefs = get_all_crossrefs(workbook, cross_ref_conf)
    in_memory_sheets = get_all_in_memory_sheet(workbook, in_memory_sheet_conf)
    content_errors = validate_content(workbook, validation_conf,
                                      crossrefs, in_memory_sheets)
    for error in content_errors:
        # if error[ERROR_CODE] == 'STD43':
        #     continue
        error = Error(error[ERROR_CODE], pk=error['id'], data=error['value'])
        error_log.add_error(error)
    return error_log
def validate_excel_structure(workbook, validation_conf):
    """Yield one error dict per missing mandatory sheet or missing mandatory column."""
    for sheet_name, sheet_conf in validation_conf.items():
        # a sheet is mandatory when its sheet-level validation TYPE is MANDATORY
        mandatory = sheet_conf.get(VALIDATION, {}).get(TYPE, None)
        mandatory = mandatory == MANDATORY
        error_code = sheet_conf.get(VALIDATION, {}).get(ERROR_CODE, False)
        try:
            sheet = workbook[sheet_name]
        except KeyError:
            sheet = None
        if sheet is None:
            if mandatory:
                yield {'id': None, 'sheet': sheet_name, 'field': None,
                       'error_code': error_code, 'value': None}
            # optional sheet absent: nothing else to check for it
            continue
        headers = _get_sheet_headers(sheet)
        for column in sheet_conf.get(COLUMNS):
            field = column[FIELD]
            for step in column.get(VALIDATION, []):
                # only MANDATORY steps are structural (column must exist)
                if step[TYPE] == MANDATORY and field not in headers:
                    yield {'id': None, 'sheet': sheet_name, 'field': field,
                           'error_code': step[ERROR_CODE], 'value': None}
def _get_sheet_headers(sheet):
first_row = next(sheet.iter_rows(min_row=1, max_row=1))
return [c.value for c in first_row]
def _get_values_from_columns(workbook, sheet_name, columns):
    """Collect, as dict keys, the stringified values of *columns* across every row of *sheet_name*."""
    indexed_values = {}
    for sheet_row in workbook_sheet_reader(workbook, sheet_name):
        indexed_values.update(
            (str(sheet_row.get(column)), "") for column in columns)
    return indexed_values
def get_all_crossrefs(workbook, cross_refs_names):
    """Build the lookup tables used by the CROSSREF validation steps.

    *cross_refs_names* maps a sheet name to the columns whose values form the
    valid set; an empty column list means 'use every cell of the sheet'.
    A missing sheet produces an empty table (which disables the check).
    """
    crossrefs = {}
    for ref_name, columns in cross_refs_names.items():
        if columns:
            crossrefs[ref_name] = _get_values_from_columns(workbook, ref_name,
                                                           columns)
        else:
            try:
                crossrefs[ref_name] = get_all_cell_data_from_sheet(workbook, ref_name)
            except ValueError as error:
                # tolerate absent optional sheets; re-raise anything else
                if 'sheet is missing' in str(error):
                    crossrefs[ref_name] = []
                else:
                    raise
    return crossrefs
def get_all_in_memory_sheet(workbook, in_memory_sheet_conf):
    """Load each configured sheet fully into memory, indexed by its configured key column."""
    in_memory_sheets = {}
    for conf in in_memory_sheet_conf:
        name = conf['sheet_name']
        key_column = conf['indexed_by']
        in_memory_sheets[name] = {
            sheet_row[key_column]: sheet_row
            for sheet_row in workbook_sheet_reader(workbook, name)}
    return in_memory_sheets
def validate_content(workbook, validation_conf, crossrefs, in_memory_sheets):
    """Yield one error dict per failing cell and per failing row-level check.

    Row-level validations (Nagoya, bibliography) only run for rows whose
    cells all passed their own validations.
    """
    for sheet_name in validation_conf.keys():
        sheet_conf = validation_conf[sheet_name]
        sheet_id_column = sheet_conf['id_field']
        # per-sheet accumulator shared by the UNIQUE validation steps
        shown_values = {}
        row_validation_steps = sheet_conf.get(ROW_VALIDATION, None)
        for row in workbook_sheet_reader(workbook, sheet_name):
            id_ = row.get(sheet_id_column, None)
            if id_ is None:
                # a row without an id is reported once and then skipped
                error_code = _get_missing_row_id_error(sheet_id_column,
                                                       sheet_conf)
                yield {'id': id_, 'sheet': sheet_name,
                       'field': sheet_id_column,
                       'error_code': error_code, 'value': None}
                continue
            do_have_cell_error = False
            for column in sheet_conf[COLUMNS]:
                label = column[FIELD]
                validation_steps = column.get(VALIDATION, None)
                value = row.get(label, None)
                if validation_steps:
                    error_code = validate_cell(value, validation_steps,
                                               crossrefs, shown_values, label)
                    if error_code is not None:
                        do_have_cell_error = True
                        yield {'id': id_, 'sheet': sheet_name, 'field': label,
                               'error_code': error_code, 'value': value}
            if not do_have_cell_error and row_validation_steps:
                error_code = validate_row(
                    row, row_validation_steps, in_memory_sheets)
                if error_code is not None:
                    yield {'id': id_, 'sheet': sheet_name, 'field': 'row',
                           'error_code': error_code, 'value': 'row'}
def _get_missing_row_id_error(sheet_id_column, sheet_conf):
    """Return the error code configured for a missing value of the sheet's id column.

    Returns None when the id column has no MISSING validation step configured
    (the previous implementation raised IndexError in that case).
    """
    for column in sheet_conf[COLUMNS]:
        if column[FIELD] == sheet_id_column:
            return next((step[ERROR_CODE] for step in column[VALIDATION]
                         if step[TYPE] == MISSING), None)
    return None
def validate_row(row, validation_steps, in_memory_sheets):
    """Run row-level validations; return the first failing step's error code, or None."""
    for validation_step in validation_steps:
        kind = validation_step[TYPE]
        error_code = validation_step[ERROR_CODE]
        if kind == NAGOYA:
            if not is_valid_nagoya(row, in_memory_sheets):
                return error_code
        elif kind == BIBLIO:
            if not is_valid_pub(row):
                return error_code
        else:
            # only NAGOYA and BIBLIO row validations are implemented
            msg = f'{kind} is not a recognized row validation type method'
            raise NotImplementedError(msg)
def validate_cell(value, validation_steps, crossrefs, shown_values, label):
    """Run every non-MANDATORY validation step on a cell value.

    Returns the first failing step's error code, or None when all pass.
    NOTE(review): the step configuration dicts are mutated here (the crossref
    tables, the shared shown_values accumulator and the column label are
    injected) so validate_value can reach them - shared mutable state.
    """
    for step_conf in validation_steps:
        # MANDATORY is a structural (column-presence) check, not a cell check
        if step_conf[TYPE] == MANDATORY:
            continue
        step_conf['crossrefs_pointer'] = crossrefs
        step_conf['shown_values'] = shown_values
        step_conf['label'] = label
        error_code = validate_value(value, step_conf)
        if error_code is not None:
            return error_code
def is_valid_pub(row):
    """Return True when a literature row carries enough data to identify the publication.

    A row is valid when it has a 'Full reference'; otherwise, journal articles
    (rows with a 'Title') need authors, journal, year, volume and first page,
    and book entries need authors, year, editors, publishers and book title.
    """
    if row.get('Full reference', None):
        return True
    title = row.get('Title', None)
    authors = row.get('Authors', None)
    journal = row.get('Journal', None)
    year = row.get('Year', None)
    # The Literature sheet schema declares 'Volume' and 'Publisher';
    # the misspelled 'Volumen'/'Publishers' keys are kept as fallbacks for
    # backwards compatibility with older files.
    volume = row.get('Volume', None) or row.get('Volumen', None)
    publishers = row.get('Publisher', None) or row.get('Publishers', None)
    first_page = row.get('First page', None)
    book_title = row.get('Book title', None)
    editors = row.get('Editors', None)
    is_journal = bool(title)
    if is_journal:
        # fix: the original tested 'not not year', a double negation that
        # rejected rows *with* a year instead of rows without one
        return bool(authors and journal and year and volume and first_page)
    return bool(authors and year and editors and publishers and book_title)
def is_valid_nagoya(row, in_memory_sheets):  # sourcery skip: return-identity
    """Nagoya-protocol sanity check: material dated 2014 or later must have a country.

    The country comes from the in-memory Geographic Origin sheet via the
    row's 'Geographic origin' index; the relevant year is the first available
    of collection, isolation, deposit or catalogue-inclusion dates.
    """
    location_index = row.get('Geographic origin', None)
    if location_index is None:
        country = None
    else:
        geo_origin = in_memory_sheets[LOCATIONS].get(location_index, {})
        country = geo_origin.get('Country', None)
    _date = row.get("Date of collection", None)
    if _date is None:
        _date = row.get("Date of isolation", None)
    if _date is None:
        _date = row.get("Date of deposit", None)
    if _date is None:
        _date = row.get("Date of inclusion in the catalogue", None)
    if _date is not None:
        # dates may be datetime objects or YYYY[...] strings/ints
        year = _date.year if isinstance(_date, datetime) else int(str(_date)[:4])
    else:
        year = None
    # the Nagoya protocol entered into force in 2014
    if year is not None and year >= 2014 and country is None:
        return False
    return True
def is_valid_regex(value, validation_conf):
    """Return True when *value* (or each of its parts, if multiple) fully matches the configured regexp."""
    if value is None:
        return True
    pattern = validation_conf[MATCH]
    if validation_conf.get(MULTIPLE, False):
        separator = validation_conf.get(SEPARATOR, None)
        parts = [part.strip() for part in str(value).split(separator)]
    else:
        parts = [str(value)]
    return all(re.fullmatch(pattern, part) for part in parts)
def is_valid_crossrefs(value, validation_conf):
    """Return True when *value* (or each of its parts) is present in the referenced cross-ref table.

    An empty cross-ref table disables the check, so everything is accepted.
    """
    crossref_name = validation_conf[CROSSREF_NAME]
    choices = validation_conf['crossrefs_pointer'][crossref_name]
    if value is None or not choices:
        return True
    value = str(value)
    # fix: removed an unreachable duplicate 'value is None' check
    if validation_conf.get(MULTIPLE, False):
        separator = validation_conf.get(SEPARATOR, None)
        values = [item.strip() for item in value.split(separator)]
    else:
        values = [value.strip()]
    return all(item in choices for item in values)
def is_valid_choices(value, validation_conf):
    """Return True when *value* (or each of its parts, if multiple) is one of the configured choices."""
    if value is None:
        return True
    allowed = validation_conf[VALUES]
    if validation_conf.get(MULTIPLE, False):
        separator = validation_conf.get(SEPARATOR, None)
        candidates = [part.strip() for part in str(value).split(separator)]
    else:
        candidates = [str(value).strip()]
    return all(candidate in allowed for candidate in candidates)
def is_valid_date(value, validation_conf):
    """Return True when *value* is a plausible (possibly partial) date.

    Accepts datetime objects, bare years (int) and YYYY[MM[DD]] strings with
    optional '-' or '/' separators. Years must fall in [1700, current year],
    months in [1, 12], and days must exist in the given month.
    """
    if value is None:
        return True
    if isinstance(value, datetime):
        year, month, day = value.year, value.month, value.day
    elif isinstance(value, int):
        year, month, day = value, None, None
    elif isinstance(value, str):
        value = value.replace('-', '').replace('/', '')
        month = None
        day = None
        try:
            year = int(value[:4])
            if len(value) >= 6:
                month = int(value[4:6])
            if len(value) >= 8:
                day = int(value[6:8])
        except (IndexError, TypeError, ValueError):
            return False
    else:
        return False
    if year < 1700 or year > datetime.now().year:
        return False
    if month is not None:
        if month < 1 or month > 12:  # fix: was '> 13', accepting month 13
            return False
        if day is not None and (day < 1 or day > monthrange(year, month)[1]):
            return False
    return True
def is_valid_coords(value, validation_conf=None):
    """Return True when *value* is 'lat; lon[; precision]' with lat in [-90, 90] and lon in [-180, 180]."""
    # sourcery skip: return-identity
    if value is None:
        return True
    try:
        items = [item.strip() for item in value.split(";")]
        latitude = float(items[0])
        longitude = float(items[1])
        if len(items) > 2:
            float(items[2])  # precision only needs to be numeric
    except (AttributeError, IndexError, ValueError):
        # fix: narrowed a bare except; covers non-strings, too few parts and
        # non-numeric parts
        return False
    return -90 <= latitude <= 90 and -180 <= longitude <= 180
def is_valid_missing(value, validation_conf=None):
    """A value passes the 'missing' check when it is present (not None)."""
    if value is None:
        return False
    return True
def is_valid_number(value, validation_conf):
    """Return True when *value* is a number within the optional 'min'/'max' bounds."""
    if value is None:
        return True
    try:
        value = float(value)
    except (TypeError, ValueError):  # merged two identical except clauses
        return False
    _max = validation_conf.get('max', None)
    _min = validation_conf.get('min', None)
    if _max is not None and value > _max:
        return False
    if _min is not None and value < _min:
        return False
    return True
def is_valid_taxon(value, validation_conf=None):
    """Return True when *value* (or each separated taxon, if multiple) is a well-formed taxon name."""
    if validation_conf is None:
        # fix: the declared default None crashed on the .get() calls below
        validation_conf = {}
    multiple = validation_conf.get(MULTIPLE, False)
    separator = validation_conf.get(SEPARATOR, ';')
    taxa = value.split(separator) if multiple else [value]
    return all(_is_valid_taxon(taxon.strip()) for taxon in taxa)
def _is_valid_taxon(value):
    """Return True when a single taxon name looks valid.

    Empty names are accepted; 'Genus sp.'-style placeholders are rejected;
    after genus and species, subtaxa arrive as alternating 'rank name' pairs
    and every rank keyword must be known (present in SUBTAXAS).
    """
    value = value.strip()
    if not value:
        return True
    items = re.split(r" +", value)
    if len(items) > 1:
        species = items[1]
        if species in ("sp", "spp", ".sp", "sp."):
            return False
    if len(items) > 2:
        for index in range(0, len(items[2:]), 2):
            rank = SUBTAXAS.get(items[index + 2], None)
            if rank is None:
                # fix: removed a leftover debug print(value)
                return False
    return True
def is_valid_unique(value, validation_conf):
    """Return True the first time *value* is seen for this column, False on repeats.

    Seen values accumulate in validation_conf['shown_values'] (a dict of
    per-column dicts used as sets), which is shared across calls for the
    whole file - that is how repeats are detected.
    """
    column_label = validation_conf['label']
    shown_values = validation_conf['shown_values']
    seen = shown_values.setdefault(column_label, {})
    if value in seen:
        return False
    seen[value] = None  # dict used as an insertion-ordered set
    return True
def is_valid_file(path):
    """Return True when *path* opens as an excel file without EXL-level errors."""
    try:
        with path.open("rb") as fhand:
            error_log = validate_mirri_excel(fhand)
            # NOTE(review): assumes get_errors() supports membership tests for
            # the 'EXL' error category - confirm against ErrorLog's API
            if "EXL" in error_log.get_errors():
                return False
    except (OSError, AttributeError):  # fix: narrowed a bare except
        return False
    return True
# Dispatch table mapping a validation step TYPE tag to its predicate.
# Every predicate has the signature predicate(value, step_conf) -> bool.
VALIDATION_FUNCTIONS = {
    MISSING: is_valid_missing,
    REGEXP: is_valid_regex,
    CHOICES: is_valid_choices,
    CROSSREF: is_valid_crossrefs,
    DATE: is_valid_date,
    COORDINATES: is_valid_coords,
    NUMBER: is_valid_number,
    TAXON: is_valid_taxon,
    UNIQUE: is_valid_unique}
def validate_value(value, step_conf):
    """Run one validation step on *value*; return its error code on failure, None on success."""
    kind = step_conf[TYPE]
    try:
        is_value_valid = VALIDATION_FUNCTIONS[kind]
    except KeyError as exc:
        msg = f'This validation type {kind} is not implemented'
        # fix: chain the original KeyError for easier debugging
        raise NotImplementedError(msg) from exc
    if not is_value_valid(value, step_conf):
        return step_conf[ERROR_CODE]
    return None

24
mirri/validation/tags.py Normal file
View File

@ -0,0 +1,24 @@
MANDATORY = "mandatory"
REGEXP = "regexp"
CHOICES = "choices"
CROSSREF = 'crossref'
CROSSREF_NAME = 'crossref_name'
MISSING = "missing"
VALIDATION = 'validation'
ERROR_CODE = 'error_code'
FIELD = 'field'
MULTIPLE = 'multiple'
TYPE = 'type'
COLUMNS = 'columns'
SOURCE = "sources"
SEPARATOR = "separator"
MATCH = 'match'
VALUES = 'values'
DATE = 'date'
COORDINATES = 'coord'
NUMBER = 'number'
TAXON = 'taxon'
UNIQUE = 'unique'
ROW_VALIDATION = 'row_validation'
NAGOYA = 'nagoya'
BIBLIO = 'bibliography'

View File

@ -0,0 +1,548 @@
from mirri.validation.tags import (CHOICES, COLUMNS, COORDINATES, CROSSREF, CROSSREF_NAME, DATE,
ERROR_CODE, FIELD, MANDATORY, MATCH,
MISSING, MULTIPLE, NAGOYA, NUMBER, REGEXP, ROW_VALIDATION, SEPARATOR, TAXON, TYPE,
UNIQUE,
VALIDATION, VALUES, BIBLIO)
from mirri.settings import (GEOGRAPHIC_ORIGIN, ONTOBIOTOPE, LOCATIONS, GROWTH_MEDIA, GENOMIC_INFO,
STRAINS, LITERATURE_SHEET, SEXUAL_STATE_SHEET)
# MARKERS,
# SEXUAL_STATE_SHEET,
# RESOURCE_TYPES_VALUES,
# FORM_OF_SUPPLY_SHEET,
# PLOIDY_SHEET)
STRAIN_FIELDS = [
{
FIELD: "Accession number",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: 'STD01'},
{TYPE: UNIQUE, ERROR_CODE: 'STD03'},
{TYPE: MISSING, ERROR_CODE: "STD02"},
{TYPE: REGEXP, MATCH: "[^ ]* [^ ]*", ERROR_CODE: "STD04"}
]
},
{
FIELD: "Restrictions on use",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD05"},
{TYPE: MISSING, ERROR_CODE: "STD06"},
{TYPE: CHOICES, VALUES: ["1", "2", "3"],
MULTIPLE: False, ERROR_CODE: "STD07"}
]
},
{
FIELD: "Nagoya protocol restrictions and compliance conditions",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD08"},
{TYPE: MISSING, ERROR_CODE: "STD09"},
{TYPE: CHOICES, VALUES: ["1", "2", "3"],
MULTIPLE: False, ERROR_CODE: "STD10"}
]
},
{
FIELD: "ABS related files",
VALIDATION: [],
},
{
FIELD: "MTA file",
VALIDATION: [],
},
{
FIELD: "Other culture collection numbers",
# VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD07",
# MULTIPLE: True, SEPARATOR: ";"}
# ]
},
{
FIELD: "Strain from a registered collection",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD11"}
]
},
{
FIELD: "Risk Group",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD12"},
{TYPE: MISSING, ERROR_CODE: "STD13"},
{TYPE: CHOICES, VALUES: ["1", "2", "3", "4"],
MULTIPLE: False, ERROR_CODE: "STD14"}
]
},
{
FIELD: "Dual use",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD15"}
]
},
{
FIELD: "Quarantine in Europe",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD16"}
]
},
{
FIELD: "Organism type",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD17"},
{TYPE: MISSING, ERROR_CODE: "STD18"},
{TYPE: CHOICES, VALUES: ["Algae", "Archaea", "Bacteria",
"Cyanobacteria", "Filamentous Fungi",
"Phage", "Plasmid", "Virus", "Yeast",
"1", "2", "3", "4", "5", "6", "7", "8", "9"],
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD19"}
]
},
{
FIELD: "Taxon name",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD20"},
{TYPE: MISSING, ERROR_CODE: "STD21"},
{TYPE: TAXON, ERROR_CODE: "STD22", MULTIPLE: True,
SEPARATOR: ';'}
]
},
{
FIELD: "Infrasubspecific names",
},
{
FIELD: "Comment on taxonomy",
},
{
FIELD: "Interspecific hybrid",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD23"}
]
},
{
FIELD: "Status",
},
{
FIELD: "History of deposit",
VALIDATION: [
# {TYPE: REGEXP, "match": "[^ ]* [^ ]*", ERROR_CODE: "STD24", # modify the regex
# MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Depositor"
},
{
FIELD: "Date of deposit",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD25"},
]
},
{
FIELD: "Date of inclusion in the catalogue",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD26"},
]
},
{
FIELD: "Collected by",
},
{
FIELD: "Date of collection",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD27"},
]
},
{
FIELD: "Isolated by",
},
{
FIELD: "Date of isolation",
VALIDATION: [
{TYPE: DATE, ERROR_CODE: "STD28"},
]
},
{
FIELD: "Substrate/host of isolation",
},
{
FIELD: "Tested temperature growth range",
VALIDATION: [
{TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
ERROR_CODE: "STD29", MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Recommended growth temperature",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD30"},
{TYPE: MISSING, ERROR_CODE: "STD31"},
{TYPE: REGEXP, "match": r'[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?',
ERROR_CODE: "STD32",
MULTIPLE: True, SEPARATOR: ";"}
]
},
{
FIELD: "Recommended medium for growth",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD33"},
{TYPE: MISSING, ERROR_CODE: "STD34"},
{TYPE: CROSSREF, CROSSREF_NAME: "Growth media",
MULTIPLE: True, SEPARATOR: "/", ERROR_CODE: "STD35"}
]
},
{
FIELD: "Form of supply",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "STD36"},
{TYPE: MISSING, ERROR_CODE: "STD37"},
{TYPE: CHOICES, VALUES: ['Agar', 'Cryo', 'Dry Ice', 'Liquid Culture Medium',
'Lyo', 'Oil', 'Water'],
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD38"}
]
},
{
FIELD: "Other denomination",
},
{
FIELD: "Coordinates of geographic origin",
VALIDATION: [
{TYPE: COORDINATES, ERROR_CODE: "STD39"},
]
},
{
FIELD: "Altitude of geographic origin",
VALIDATION: [
{TYPE: NUMBER, 'max': 8000, 'min': -200, ERROR_CODE: "STD40"},
]
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "Geographic origin",
},
{
FIELD: "Isolation habitat",
},
{
FIELD: "Ontobiotope term for the isolation habitat",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: "Ontobiotope",
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD41"}
]
},
{
FIELD: "GMO",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["1", "2"],
ERROR_CODE: "STD42"}
]
},
{
FIELD: "GMO construction information",
},
{
FIELD: "Mutant information",
},
{
FIELD: "Genotype",
},
{
FIELD: "Sexual state",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: SEXUAL_STATE_SHEET,
ERROR_CODE: "STD43"}
# {TYPE: CHOICES, VALUES: ["Mata", "Matalpha", "Mata/Matalpha",
# "Matb", "Mata/Matb", "MTLa", "MTLalpha", "MTLa/MTLalpha",
# "MAT1-1", "MAT1-2", "MAT1", "MAT2", "MT+", "MT-"],
# ERROR_CODE: "STD43"}
]
},
{
FIELD: "Ploidy",
VALIDATION: [
{TYPE: CHOICES, VALUES: ["0", "1", "2", "3", "4", "9"],
ERROR_CODE: "STD44"}
]
},
{
FIELD: "Plasmids",
},
{
FIELD: "Plasmids collections fields",
},
{
# value can be in the cell or in another sheet. Don't configure this
FIELD: "Literature",
VALIDATION: [
{TYPE: CROSSREF, CROSSREF_NAME: LITERATURE_SHEET,
MULTIPLE: True, SEPARATOR: ";", ERROR_CODE: "STD45"}
]
},
{
FIELD: "Plant pathogenicity code",
},
{
FIELD: "Pathogenicity",
},
{
FIELD: "Enzyme production",
},
{
FIELD: "Production of metabolites",
},
{
FIELD: "Applications",
},
{
FIELD: "Remarks"
},
{
FIELD: "Literature linked to the sequence/genome",
},
]
SHEETS_SCHEMA = {
LOCATIONS: {
"acronym": "GOD",
"id_field": "ID",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS02"},
COLUMNS: [
{
FIELD: "ID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD01"},
{TYPE: MISSING, ERROR_CODE: "GOD02"},
]
},
{
FIELD: "Country",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD03"},
{TYPE: MISSING, ERROR_CODE: "GOD04"}
]
},
{
FIELD: "Region",
VALIDATION: []
},
{
FIELD: "City",
VALIDATION: []
},
{
FIELD: "Locality",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GOD06"},
{TYPE: MISSING, ERROR_CODE: "GOD07"}
]
}
],
},
GROWTH_MEDIA: {
"acronym": "GMD",
"id_field": "Acronym",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS01"},
COLUMNS: [
{
FIELD: "Acronym",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GMD01"},
{TYPE: MISSING, ERROR_CODE: "GMD02"}
]
},
{
FIELD: "Description",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GMD03"},
{TYPE: MISSING, ERROR_CODE: "GMD04"}
]
},
{
FIELD: "Full description",
VALIDATION: []
},
],
},
GENOMIC_INFO: {
"acronym": "GID",
"id_field": "Strain AN",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS08"},
COLUMNS: [
{
FIELD: "Strain AN",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID01"},
{TYPE: MISSING, ERROR_CODE: "GID02"},
{TYPE: CROSSREF, CROSSREF_NAME: "Strains",
ERROR_CODE: "GID03"},
]
},
{
FIELD: "Marker",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID04"},
{TYPE: MISSING, ERROR_CODE: "GID05"},
{TYPE: CHOICES, ERROR_CODE: "GID06",
VALUES: ['16S rRNA', 'ACT', 'CaM', 'EF-1α', 'ITS',
'LSU', 'RPB1', 'RPB2', 'TUBB']}
]
},
{
FIELD: "INSDC AN",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "GID07"},
{TYPE: MISSING, ERROR_CODE: "GID08"},
]
},
{
FIELD: "Sequence",
VALIDATION: []
},
],
},
STRAINS: {
"acronym": "STD",
'id_field': 'Accession number',
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS05"},
ROW_VALIDATION: [
{TYPE: NAGOYA, ERROR_CODE: "STRXXX"},
],
COLUMNS: STRAIN_FIELDS,
},
LITERATURE_SHEET: {
"acronym": "LID",
'id_field': 'ID',
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS03"},
ROW_VALIDATION: [
{TYPE: BIBLIO, ERROR_CODE: 'LID17'}
],
COLUMNS: [
{
FIELD: "ID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID01"},
{TYPE: MISSING, ERROR_CODE: "LID02"},
]
},
{
FIELD: "Full reference",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID03"},
]
},
{
FIELD: "Authors",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID05"},
]
},
{
FIELD: "Title",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID07"},
]
},
{
FIELD: "Journal",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID09"},
]
},
{
FIELD: "Year",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID11"},
]
},
{
FIELD: "Volume",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID13"},
]
},
{
FIELD: "Issue",
VALIDATION: []
},
{
FIELD: "First page",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "LID15"},
{TYPE: MISSING, ERROR_CODE: "LID16"},
]
},
{
FIELD: "Last page",
VALIDATION: []
},
{
FIELD: "Book title",
VALIDATION: []
},
{
FIELD: "Editors",
VALIDATION: []
},
{
FIELD: "Publisher",
VALIDATION: []
}
],
},
# SEXUAL_STATE_SHEET: {"acronym": "SSD", COLUMNS: []},
# RESOURCE_TYPES_VALUES: {"acronym": "RTD", COLUMNS: []},
# FORM_OF_SUPPLY_SHEET: {"acronym": "FSD", COLUMNS: []},
# PLOIDY_SHEET: {"acronym": "PLD", COLUMNS: []},
ONTOBIOTOPE: {
"acronym": "OTD",
"id_field": "ID",
VALIDATION: {TYPE: MANDATORY, ERROR_CODE: "EFS06"},
COLUMNS: [
{
FIELD: "ID",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "OTD01"},
{TYPE: MISSING, ERROR_CODE: "OTD02"},
]
},
{
FIELD: "Name",
VALIDATION: [
{TYPE: MANDATORY, ERROR_CODE: "OTD03"},
{TYPE: MISSING, ERROR_CODE: "OTD04"},
]
},
]
},
# MARKERS: {
# "acronym": "MKD",
# "id_field": "",
# COLUMNS: [
# {
# FIELD: "Acronym",
# VALIDATION: []
# },
# {
# FIELD: "Marker",
# VALIDATION: []
# },
# ],
# },
}
# Sheet name -> columns whose values are valid cross-reference targets.
# An empty list means 'accept any cell value found in the sheet'.
CROSS_REF_CONF = {
    ONTOBIOTOPE: ['ID', 'Name'],
    LITERATURE_SHEET: ['ID'],
    LOCATIONS: ['Locality'],
    GROWTH_MEDIA: ['Acronym'],
    STRAINS: ["Accession number"],
    SEXUAL_STATE_SHEET: []
}
# Top-level validation configuration for the 20200601 MIRRI specification.
MIRRI_20200601_VALLIDATION_CONF = {
    'sheet_schema': SHEETS_SCHEMA,
    'cross_ref_conf': CROSS_REF_CONF,
    'keep_sheets_in_memory': [
        {'sheet_name': LOCATIONS, 'indexed_by': 'Locality'}]
}
# Correctly-spelled alias. The original name (with the 'VALLIDATION' typo) is
# kept because other modules import it.
MIRRI_20200601_VALIDATION_CONF = MIRRI_20200601_VALLIDATION_CONF

5
requirements.txt Normal file
View File

@ -0,0 +1,5 @@
openpyxl
requests
requests_oauthlib
pycountry
deepdiff

35
setup.py Normal file
View File

@ -0,0 +1,35 @@
import setuptools
from pathlib import Path
from setuptools import find_packages

# Read the long description and the pinned requirements from disk.
with open("README.md", "r") as fh:
    long_description = fh.read()
with open('requirements.txt') as req_fhand:  # fix: the handle was leaked
    requirements = [line.strip() for line in req_fhand]

scripts = [str(f) for f in Path('./bin').glob('*.py')]

setuptools.setup(
    name="Mirri utils",
    version="0.1",  # fix: setuptools expects the version as a string
    author="P.Ziarsolo",
    author_email="pziarsolo@gmail.com",
    description="A small library to help dealing with MIRRI data",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/pziarsolo/mirri_utils",
    packages=find_packages(),
    package_data={"mirri": ['data/ontobiotopes.csv']},
    install_requires=requirements,
    scripts=scripts,
    license="GNU General Public License v3.0",
    classifiers=[
        "Programming Language :: Python :: 3",
        # fix: the classifier said MIT while license= declares GPLv3
        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
)

0
tests/__init__.py Normal file
View File

View File

View File

@ -0,0 +1,22 @@
import unittest
from mirri.biolomics.remote.rest_client import BiolomicsClient
try:
from mirri.biolomics.secrets import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
except ImportError:
raise ImportError(
'You need a secrets.py in the project dir. with CLIENT_ID, SECRET_ID, USERNAME, PASSWORD')
from .utils import VERSION, SERVER_URL
class BiolomicsClientAuthTest(unittest.TestCase):
    """Integration test for BiolomicsClient OAuth authentication (needs network and credentials)."""

    def test_authentication(self):
        client = BiolomicsClient(SERVER_URL, VERSION, CLIENT_ID, SECRET_ID,
                                 USERNAME, PASSWORD)
        # two consecutive requests must reuse the same (cached) token
        access1 = client.get_access_token()
        access2 = client.get_access_token()
        assert access1 is not None
        self.assertEqual(access1, access2)

View File

@ -0,0 +1,62 @@
import unittest
from mirri.biolomics.remote.endoint_names import GROWTH_MEDIUM_WS
from mirri.biolomics.serializers.growth_media import GrowthMedium
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from tests.biolomics.utils import SERVER_URL, VERSION
# NOTE(review): this class tests growth media but is named ...SequenceClientTest,
# apparently copy-pasted; the name is kept so external test selection still works.
class BiolomicsSequenceClientTest(unittest.TestCase):
    """Integration tests for growth-medium records of BiolomicsMirriClient (needs network)."""

    def setUp(self):
        self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
                                           SECRET_ID, USERNAME, PASSWORD)

    def test_retrieve_media_by_id(self):
        record_id = 101
        growth_medium = self.client.retrieve_by_id('growth_medium', record_id)
        self.assertEqual(growth_medium.record_id, record_id)
        self.assertEqual(growth_medium.record_name, 'MA2PH6')

    # fix: this test was also named test_retrieve_media_by_id, which silently
    # shadowed the previous one so it never ran
    def test_retrieve_media_by_name(self):
        record_name = 'MA2PH6'
        record_id = 101
        growth_medium = self.client.retrieve_by_name('growth_medium', record_name)
        self.assertEqual(growth_medium.record_id, record_id)
        self.assertEqual(growth_medium.record_name, record_name)

    def test_create_growth_media(self):
        self.client.start_transaction()
        try:
            growth_medium = GrowthMedium()
            growth_medium.acronym = 'BBB'
            growth_medium.ingredients = 'alkhdflakhf'
            growth_medium.description = 'desc'
            new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium)
            # fix: replaced a debug print with a real assertion
            self.assertEqual(new_growth_medium.acronym, growth_medium.acronym)
            self.assertEqual(new_growth_medium.description, growth_medium.description)
        finally:
            self.client.rollback()

    def test_update_growth_media(self):
        self.client.start_transaction()
        try:
            growth_medium = GrowthMedium()
            growth_medium.acronym = 'BBB'
            growth_medium.ingredients = 'alkhdflakhf'
            growth_medium.description = 'desc'
            growth_medium.full_description = 'full'
            new_growth_medium = self.client.create(GROWTH_MEDIUM_WS, growth_medium)
            new_growth_medium.full_description = 'full2'
            # fix: the old 'updated_gm = new_growth_medium = ...' double
            # assignment made the next assertion compare an object with
            # itself; compare against the intended new value instead
            updated_gm = self.client.update(GROWTH_MEDIUM_WS, new_growth_medium)
            self.assertEqual(updated_gm.full_description, 'full2')
            retrieved = self.client.retrieve_by_id(GROWTH_MEDIUM_WS,
                                                   new_growth_medium.record_id)
            self.assertEqual(retrieved.full_description, updated_gm.full_description)
        finally:
            self.client.rollback()

View File

@ -0,0 +1,46 @@
import unittest
from .utils import VERSION, SERVER_URL
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient, BIBLIOGRAPHY_WS
from mirri.entities.publication import Publication
class BiolomicsLiteratureClientTest(unittest.TestCase):
    """Integration tests for bibliography records of BiolomicsMirriClient (needs network)."""

    def setUp(self):
        self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
                                           SECRET_ID, USERNAME, PASSWORD)

    def test_retrieve_biblio_by_id(self):
        record_id = 100
        record_name = "Miscellaneous notes on Mucoraceae"
        biblio = self.client.retrieve_by_id(BIBLIOGRAPHY_WS, record_id)
        self.assertEqual(biblio.record_id, record_id)
        self.assertEqual(biblio.record_name, record_name)

    # fix: was misnamed test_retrieve_media_by_id (copy-pasted from the
    # growth-media tests) although it retrieves a bibliography record by name
    def test_retrieve_biblio_by_name(self):
        record_id = 100
        record_name = "Miscellaneous notes on Mucoraceae"
        biblio = self.client.retrieve_by_name(BIBLIOGRAPHY_WS, record_name)
        self.assertEqual(biblio.record_id, record_id)
        self.assertEqual(biblio.record_name, record_name)
        self.assertEqual(biblio.year, 1994)
        self.assertEqual(biblio.volume, '50')

    def test_create_biblio(self):
        pub = Publication()
        pub.pubmed_id = 'PM18192'
        pub.journal = 'my_journal'
        pub.title = 'awesome title'
        pub.authors = 'pasdas, aposjdasd, alsalsfda'
        pub.volume = 'volume 0'
        record_id = None
        try:
            new_pub = self.client.create(BIBLIOGRAPHY_WS, pub)
            record_id = new_pub.record_id
            self.assertEqual(new_pub.title, pub.title)
            self.assertEqual(new_pub.volume, pub.volume)
        finally:
            # always remove the record created on the remote service
            if record_id is not None:
                self.client.delete_by_id(BIBLIOGRAPHY_WS, record_id)

View File

@ -0,0 +1,49 @@
import unittest
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.serializers.sequence import GenomicSequenceBiolomics
from .utils import VERSION, SERVER_URL
class BiolomicsSequenceClientTest(unittest.TestCase):
    """Integration tests for sequence records of BiolomicsMirriClient (needs network)."""

    def setUp(self) -> None:
        self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
                                           SECRET_ID, USERNAME, PASSWORD)

    def test_retrieve_seq_by_id(self):
        sequence = self.client.retrieve_by_id('sequence', 101)
        self.assertEqual(sequence.record_id, 101)
        self.assertEqual(sequence.record_name, 'MUM 02.54 - CaM')
        self.assertEqual(sequence.marker_type, 'CaM')

    def test_retrieve_seq_by_name(self):
        name = 'MUM 02.54 - CaM'
        sequence = self.client.retrieve_by_name('sequence', name)
        self.assertEqual(sequence.record_id, 101)
        self.assertEqual(sequence.record_name, name)
        self.assertEqual(sequence.marker_type, 'CaM')

    def test_create_delete_sequence(self):
        marker = GenomicSequenceBiolomics()
        marker.marker_id = 'GGAAUUA'
        marker.marker_seq = 'aattgacgat'
        marker.marker_type = 'CaM'
        marker.record_name = 'peioMarker'
        created = self.client.create('sequence', marker)
        # the created record must echo every attribute we sent
        for attr in ('marker_id', 'marker_seq', 'marker_type', 'record_name'):
            self.assertEqual(getattr(created, attr), getattr(marker, attr))
        self.assertTrue(created.record_id)
        # clean up the record created on the remote service
        self.client.delete_by_id('sequence', created.record_id)
if __name__ == "__main__":
# import sys;sys.argv = ['', 'BiolomicsClient.Test.test_get_strain_by_id']
unittest.main()

View File

@ -0,0 +1,727 @@
import unittest
import pycountry
import deepdiff
from pprint import pprint
from mirri.biolomics.serializers.sequence import (
GenomicSequenceBiolomics,
serialize_to_biolomics as sequence_to_biolomics,
serialize_from_biolomics as sequence_from_biolomics)
from mirri.biolomics.serializers.strain import (
serialize_to_biolomics as strain_to_biolomics,
serialize_from_biolomics as strain_from_biolomics)
from mirri.biolomics.serializers.growth_media import (
# serialize_to_biolomics as growth_medium_to_biolomics,
serialize_from_biolomics as growth_medium_from_biolomics)
from mirri.biolomics.serializers.bibliography import (
serializer_from_biolomics as literature_from_biolomics,
serializer_to_biolomics as literature_to_biolomics
)
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.entities.publication import Publication
from .utils import create_full_data_strain, VERSION, SERVER_URL
STRAIN_WS = {
'CreationDate': '2021-05-19T12:22:33',
'CreatorUserName': 'pziarsolo@cect.org',
'LastChangeDate': '2021-05-19T12:22:36',
'LastChangeUserName': 'pziarsolo@cect.org',
'RecordDetails': {'ABS related files': {'FieldType': 21,
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'Altitude of geographic origin': {'FieldType': 4,
'Value': 121.0},
'Applications': {'FieldType': 5, 'Value': 'health'},
'Catalog URL': {'FieldType': 21, 'Value': []},
'Collection accession number': {'FieldType': 5,
'Value': 'TESTCC 1'},
'Collection date': {'FieldType': 8, 'Value': '1991/01/01'},
'Collector': {'FieldType': 5, 'Value': 'the collector'},
'Comment on taxonomy': {'FieldType': 5,
'Value': 'lalalalla'},
'Coordinates of geographic origin': {'FieldType': 12,
'Value': {'Altitude': 0.0,
'Latitude': 23.3,
'Longitude': 23.3,
'Precision': 0.0}},
'Country': {'FieldType': 118,
'Value': [{'Name': {'FieldType': 5,
'Value': 'Spain'},
'RecordId': 54,
'TargetFieldValue': None}]},
'Data provided by': {'FieldType': 22, 'Value': 'Unknown'},
'Date of inclusion in the catalogue': {'FieldType': 8,
'Value': '1985/05/02'},
'Deposit date': {'FieldType': 8, 'Value': '1985/05/02'},
'Depositor': {'FieldType': 5,
'Value': 'NCTC, National Collection of Type '
'Cultures - NCTC, London, United '
'Kingdom of Great Britain and '
'Northern Ireland.'},
'Dual use': {'FieldType': 20, 'Value': 'yes'},
'Enzyme production': {'FieldType': 5,
'Value': 'some enzimes'},
'Form': {'FieldType': 3,
'Value': [{'Name': 'Agar', 'Value': 'yes'},
{'Name': 'Cryo', 'Value': 'no'},
{'Name': 'Dry Ice', 'Value': 'no'},
{'Name': 'Liquid Culture Medium',
'Value': 'no'},
{'Name': 'Lyo', 'Value': 'yes'},
{'Name': 'Oil', 'Value': 'no'},
{'Name': 'Water', 'Value': 'no'}]},
'GMO': {'FieldType': 22, 'Value': 'Yes'},
'GMO construction information': {'FieldType': 5,
'Value': 'instructrion to '
'build'},
'Genotype': {'FieldType': 5, 'Value': 'some genotupe'},
'Geographic origin': {'FieldType': 5,
'Value': 'una state; one '
'municipality; somewhere in '
'the world'},
'History': {'FieldType': 5,
'Value': 'newer < In the middle < older'},
'Infrasubspecific names': {'FieldType': 5,
'Value': 'serovar tete'},
'Interspecific hybrid': {'FieldType': 20, 'Value': 'no'},
'Isolation date': {'FieldType': 8, 'Value': '1900/01/01'},
'Isolation habitat': {'FieldType': 5,
'Value': 'some habitat'},
'Isolator': {'FieldType': 5, 'Value': 'the isolator'},
'Literature': {'FieldType': 118, 'Value': []},
'MTA files URL': {'FieldType': 21,
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'MTA text': {'FieldType': 5, 'Value': ''},
'Metabolites production': {'FieldType': 5,
'Value': 'big factory of cheese'},
'Mutant information': {'FieldType': 5, 'Value': 'x-men'},
'Nagoya protocol restrictions and compliance conditions': {'FieldType': 20,
'Value': 'no '
'known '
'restrictions '
'under '
'the '
'Nagoya '
'protocol'},
'Ontobiotope': {'FieldType': 118,
'Value': [{'Name': {'FieldType': 5,
'Value': 'anaerobic '
'bioreactor '
'(OBT:000190)'},
'RecordId': 100,
'TargetFieldValue': None}]},
'Ontobiotope term for the isolation habitat': {'FieldType': 5,
'Value': ''},
'Orders': {'FieldType': 118, 'Value': []},
'Organism type': {'FieldType': 3,
'Value': [{'Name': 'Algae', 'Value': 'no'},
{'Name': 'Archaea',
'Value': 'yes'},
{'Name': 'Bacteria',
'Value': 'no'},
{'Name': 'Cyanobacteria',
'Value': 'no'},
{'Name': 'Filamentous Fungi',
'Value': 'no'},
{'Name': 'Phage', 'Value': 'no'},
{'Name': 'Plasmid',
'Value': 'no'},
{'Name': 'Virus', 'Value': 'no'},
{'Name': 'Yeast', 'Value': 'no'},
{'Name': 'Microalgae',
'Value': '?'}]},
'Other culture collection numbers': {'FieldType': 5,
'Value': 'aaa a; aaa3 '
'a3'},
'Other denomination': {'FieldType': 5, 'Value': ''},
'Pathogenicity': {'FieldType': 5, 'Value': 'illness'},
'Plasmids': {'FieldType': 5, 'Value': 'asda'},
'Plasmids collections fields': {'FieldType': 5,
'Value': 'asdasda'},
'Ploidy': {'FieldType': 20, 'Value': 'Polyploid'},
'Quarantine in Europe': {'FieldType': 20, 'Value': 'no'},
'Recommended growth medium': {'FieldType': 118,
'Value': [{'Name': {'FieldType': 5,
'Value': 'AAA'},
'RecordId': 1,
'TargetFieldValue': None}]},
'Recommended growth temperature': {'FieldType': 19,
'MaxValue': 30.0,
'MinValue': 30.0},
'Remarks': {'FieldType': 5, 'Value': 'no remarks for me'},
'Restrictions on use': {'FieldType': 20,
'Value': 'no restriction apply'},
'Risk group': {'FieldType': 20, 'Value': '1'},
'Sequences 16s': {"Value": [
{
"Name": {
"Value": "X76436",
"FieldType": 5
},
"RecordId": 50992,
"TargetFieldValue": {
"Value": {
"Sequence": ""
},
"FieldType": 14
}
}
],
"FieldType": 114},
'Sequences 18S rRNA': {'FieldType': 114, 'Value': []},
'Sequences 23S rRNA': {'FieldType': 114, 'Value': []},
'Sequences ACT': {'FieldType': 114, 'Value': []},
'Sequences AmdS': {'FieldType': 114, 'Value': []},
'Sequences Amds12': {'FieldType': 114, 'Value': []},
'Sequences Beta tubulin': {'FieldType': 114, 'Value': []},
'Sequences COX1': {'FieldType': 114, 'Value': []},
'Sequences COX2': {'FieldType': 114, 'Value': []},
'Sequences CaM': {'FieldType': 114, 'Value': []},
'Sequences Cct8': {'FieldType': 114, 'Value': []},
'Sequences Cit1': {'FieldType': 114, 'Value': []},
'Sequences CypA': {'FieldType': 114, 'Value': []},
'Sequences GDP': {'FieldType': 114, 'Value': []},
'Sequences GPD': {'FieldType': 114, 'Value': []},
'Sequences Genome': {'FieldType': 114, 'Value': []},
'Sequences HIS': {'FieldType': 114, 'Value': []},
'Sequences HSP': {'FieldType': 114, 'Value': []},
'Sequences IDH': {'FieldType': 114, 'Value': []},
'Sequences IGS': {'FieldType': 114, 'Value': []},
'Sequences ITS': {'FieldType': 114, 'Value': []},
'Sequences LSU': {'FieldType': 114, 'Value': []},
'Sequences MAT': {'FieldType': 114, 'Value': []},
'Sequences MAT1': {'FieldType': 114, 'Value': []},
'Sequences Miscellaneous': {'FieldType': 114, 'Value': []},
'Sequences NorA': {'FieldType': 114, 'Value': []},
'Sequences NorB': {'FieldType': 114, 'Value': []},
'Sequences Omt12': {'FieldType': 114, 'Value': []},
'Sequences OmtA': {'FieldType': 114, 'Value': []},
'Sequences PcCYP': {'FieldType': 114, 'Value': []},
'Sequences PpgA': {'FieldType': 114, 'Value': []},
'Sequences PreA': {'FieldType': 114, 'Value': []},
'Sequences PreB': {'FieldType': 114, 'Value': []},
'Sequences RAPD': {'FieldType': 114, 'Value': []},
'Sequences RPB1': {'FieldType': 114, 'Value': []},
'Sequences RPB2': {'FieldType': 114, 'Value': []},
'Sequences SSU': {'FieldType': 114, 'Value': []},
'Sequences TEF1a': {'FieldType': 114, 'Value': []},
'Sequences TEF2': {'FieldType': 114, 'Value': []},
'Sequences TUB': {'FieldType': 114, 'Value': []},
'Sequences Tsr1': {'FieldType': 114, 'Value': []},
'Sequences c16S rRNA': {'FieldType': 114, 'Value': []},
'Sequences cbhI': {'FieldType': 114, 'Value': []},
'Sequences mcm7': {'FieldType': 114, 'Value': []},
'Sequences rbcL': {'FieldType': 114, 'Value': []},
'Sexual state': {'FieldType': 5, 'Value': 'MT+A'},
'Status': {'FieldType': 5,
'Value': 'type of Bacillus alcalophilus'},
'Strain from a registered collection': {'FieldType': 20,
'Value': 'no'},
'Substrate of isolation': {'FieldType': 5,
'Value': 'some substrate'},
'Taxon name': {'FieldType': 109,
'Value': [{'Name': {'FieldType': 5,
'Value': 'Escherichia '
'coli'},
'RecordId': 100004123,
'TargetFieldValue': {'DesktopInfo': None,
'DesktopInfoHtml': '<b>Current '
'name: '
'</b><i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'FieldType': 27,
'NewSynFieldInfo': None,
'ObligateSynonymId': 0,
'OriginalSynFieldInfo': None,
'SynInfo': {'BasionymRecord': {'NameInfo': '',
'RecordId': 100004123,
'RecordName': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'SecondLevelRecords': None},
'CurrentNameRecord': {'NameInfo': '',
'RecordId': 100004123,
'RecordName': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'SecondLevelRecords': None},
'ObligateSynonymRecords': [],
'SelectedRecord': {
'NameInfo': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'RecordId': 100004123,
'RecordName': '<i>Escherichia '
'coli</i> '
'(Migula '
'1895) '
'Castellani '
'and '
'Chalmers '
'1919',
'SecondLevelRecords': None},
'TaxonSynonymsRecords': []},
'SynonymId': 100004123}}]},
'Tested temperature growth range': {'FieldType': 19,
'MaxValue': 32.0,
'MinValue': 29.0},
'Type description': {'FieldType': 5, 'Value': ''}},
'RecordId': 148038,
'RecordName': 'MIRRI 2240561'}
STRAIN_WS_EXPECTED_NO_REMOTE = {
'Acronym': 'MIRRI',
'RecordDetails': {'ABS related files': {'FieldType': 'U',
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'Altitude of geographic origin': {'FieldType': 'D',
'Value': 121},
'Applications': {'FieldType': 'E', 'Value': 'health'},
'Collection accession number': {'FieldType': 'E',
'Value': 'TESTCC 1'},
'Collection date': {'FieldType': 'H', 'Value': '1991-01-01'},
'Collector': {'FieldType': 'E', 'Value': 'the collector'},
'Comment on taxonomy': {'FieldType': 'E',
'Value': 'lalalalla'},
'Coordinates of geographic origin': {'FieldType': 'L',
'Value': {'Latitude': 23.3,
'Longitude': 23.3}},
'Date of inclusion in the catalogue': {'FieldType': 'H',
'Value': '1985-05-02'},
'Deposit date': {'FieldType': 'H', 'Value': '1985-05-02'},
'Depositor': {'FieldType': 'E',
'Value': 'NCTC, National Collection of Type '
'Cultures - NCTC, London, United '
'Kingdom of Great Britain and '
'Northern Ireland.'},
'Dual use': {'FieldType': 'T', 'Value': 'yes'},
'Enzyme production': {'FieldType': 'E',
'Value': 'some enzimes'},
'Form': {'FieldType': 'C',
'Value': [{'Name': 'Agar', 'Value': 'yes'},
{'Name': 'Cryo', 'Value': 'no'},
{'Name': 'Dry Ice', 'Value': 'no'},
{'Name': 'Liquid Culture Medium',
'Value': 'no'},
{'Name': 'Lyo', 'Value': 'yes'},
{'Name': 'Oil', 'Value': 'no'},
{'Name': 'Water', 'Value': 'no'}]},
'GMO': {'FieldType': 'V', 'Value': 'Yes'},
'GMO construction information': {'FieldType': 'E',
'Value': 'instructrion to '
'build'},
'Genotype': {'FieldType': 'E', 'Value': 'some genotupe'},
'Geographic origin': {'FieldType': 'E',
'Value': 'una state; one '
'municipality; somewhere in '
'the world'},
'History': {'FieldType': 'E',
'Value': 'firstplave < seconn place < third '
'place'},
'Infrasubspecific names': {'FieldType': 'E',
'Value': 'serovar tete'},
'Interspecific hybrid': {'FieldType': 'T', 'Value': 'no'},
'Isolation date': {'FieldType': 'H', 'Value': '1900-01-01'},
'Isolation habitat': {'FieldType': 'E',
'Value': 'some habitat'},
'Isolator': {'FieldType': 'E', 'Value': 'the isolator'},
'MTA files URL': {'FieldType': 'U',
'Value': [{'Name': 'link',
'Value': 'https://example.com'}]},
'Metabolites production': {'FieldType': 'E',
'Value': 'big factory of cheese'},
'Mutant information': {'FieldType': 'E', 'Value': 'x-men'},
'Nagoya protocol restrictions and compliance conditions': {'FieldType': 'T',
'Value': 'no '
'known '
'restrictions '
'under '
'the '
'Nagoya '
'protocol'},
'Ontobiotope': {'FieldType': 'RLink', 'Value': 'OBT:000190'},
'Organism type': {'FieldType': 'C',
'Value': [{'Name': 'Algae', 'Value': 'no'},
{'Name': 'Archaea',
'Value': 'yes'},
{'Name': 'Bacteria',
'Value': 'no'},
{'Name': 'Cyanobacteria',
'Value': 'no'},
{'Name': 'Filamentous Fungi',
'Value': 'no'},
{'Name': 'Phage', 'Value': 'no'},
{'Name': 'Plasmid',
'Value': 'no'},
{'Name': 'Virus', 'Value': 'no'},
{'Name': 'Yeast',
'Value': 'no'}]},
'Other culture collection numbers': {'FieldType': 'E',
'Value': 'aaa a; aaa3 '
'a3'},
'Pathogenicity': {'FieldType': 'E', 'Value': 'illness'},
'Plasmids': {'FieldType': 'E', 'Value': 'asda'},
'Plasmids collections fields': {'FieldType': 'E',
'Value': 'asdasda'},
'Ploidy': {'FieldType': 'T', 'Value': 'Polyploid'},
'Quarantine in Europe': {'FieldType': 'T', 'Value': 'no'},
'Recommended growth temperature': {'FieldType': 'S',
'MaxValue': 30.0,
'MinValue': 30.0},
'Remarks': {'FieldType': 'E', 'Value': 'no remarks for me'},
'Restrictions on use': {'FieldType': 'T',
'Value': 'no restriction apply'},
'Risk group': {'FieldType': 'T', 'Value': '1'},
'Sexual state': {'FieldType': 'E', 'Value': 'MT+A'},
'Status': {'FieldType': 'E',
'Value': 'type of Bacillus alcalophilus'},
'Strain from a registered collection': {'FieldType': 'T',
'Value': 'no'},
'Substrate of isolation': {'FieldType': 'E',
'Value': 'some substrate'},
'Taxon name': {'FieldType': 'SynLink',
'Value': 'Escherichia coli'},
'Tested temperature growth range': {'FieldType': 'S',
'MaxValue': 32.0,
'MinValue': 29.0}}}
class StrainSerializerTest(unittest.TestCase):
    """Tests for the strain <-> Biolomics web-service payload serializers."""

    def test_serialize_to_biolomics(self):
        # With client=None no remote lookups can be resolved, so the payload
        # must match the "no remote" fixture exactly.
        strain = create_full_data_strain()
        ws_strain = strain_to_biolomics(strain, client=None)
        self.assertDictEqual(ws_strain, STRAIN_WS_EXPECTED_NO_REMOTE)

    def test_serialize_to_biolomics_remote(self):
        # With a live client the serializer resolves linked records remotely
        # (ontobiotope, country, literature, sequence markers).
        client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
                                      SECRET_ID, USERNAME, PASSWORD)
        strain = create_full_data_strain()
        marker = GenomicSequenceBiolomics()
        marker.marker_id = "MUM 02.15 - Beta tubulin"
        marker.marker_type = 'TUBB'
        strain.genetics.markers = [marker]
        ws_strain = strain_to_biolomics(strain, client=client)
        self.assertEqual(strain.collect.habitat_ontobiotope,
                         ws_strain['RecordDetails']['Ontobiotope']['Value'][0]['Name']['Value'])
        # The alpha-3 country code is serialized as the full country name.
        self.assertEqual(pycountry.countries.get(alpha_3=strain.collect.location.country).name,
                         ws_strain['RecordDetails']['Country']['Value'][0]['Name']['Value'])
        self.assertEqual(strain.publications[0].title,
                         ws_strain['RecordDetails']['Literature']['Value'][0]['Name']['Value'])
        # The 'TUBB' marker type maps to the 'Sequences TUB' ws field.
        self.assertEqual(strain.genetics.markers[0].marker_id,
                         ws_strain['RecordDetails']['Sequences TUB']['Value'][0]['Name']['Value'])

    def test_serialize_from_biolomics(self):
        # Deserialize the captured STRAIN_WS payload and spot-check that each
        # section of the strain entity was populated from it.
        ws_strain = STRAIN_WS
        strain = strain_from_biolomics(ws_strain)
        self.assertEqual(strain.record_id, 148038)
        self.assertEqual(strain.record_name, 'MIRRI 2240561')
        self.assertEqual(strain.taxonomy.long_name, 'Escherichia coli')
        self.assertEqual(strain.growth.recommended_media, ['AAA'])
        self.assertEqual(strain.collect.location.altitude, 121)
        self.assertEqual(strain.collect.location.country, 'ESP')
        self.assertEqual(strain.applications, 'health')
        self.assertEqual(strain.id.strain_id, 'TESTCC 1')
        self.assertEqual(strain.collect.date.strfdate, '19910101')
        self.assertEqual(strain.taxonomy.comments, 'lalalalla')
        self.assertEqual(strain.catalog_inclusion_date.strfdate, '19850502')
        self.assertIn('NCTC, National Collection of Type ', strain.deposit.who)
        self.assertTrue(strain.is_potentially_harmful)
        # Only forms marked 'yes' in the ws payload survive deserialization.
        self.assertEqual(strain.form_of_supply, ['Agar', 'Lyo'])
        self.assertTrue(strain.genetics.gmo)
        self.assertEqual(strain.genetics.gmo_construction, 'instructrion to build')
        self.assertEqual(strain.genetics.genotype, 'some genotupe')
        # The '<'-separated history string is split into a list.
        self.assertEqual(strain.history, ['newer', 'In the middle', 'older'])
        self.assertEqual(strain.taxonomy.infrasubspecific_name, 'serovar tete')
        self.assertEqual(strain.isolation.who, 'the isolator')
        self.assertEqual(strain.isolation.date.strfdate, '19000101')
        self.assertEqual(strain.mta_files, ['https://example.com'])
        self.assertEqual(strain.genetics.mutant_info, 'x-men')
        # The OBT code is extracted from the 'name (OBT:...)' ws value.
        self.assertEqual(strain.collect.habitat_ontobiotope, 'OBT:000190')
        self.assertEqual(strain.taxonomy.organism_type[0].name, 'Archaea')
        self.assertEqual(strain.other_numbers[0].strain_id, 'aaa a')
        self.assertEqual(strain.other_numbers[1].strain_id, 'aaa3 a3')
        self.assertEqual(strain.pathogenicity, 'illness')
        self.assertEqual(strain.genetics.plasmids, ['asda'])
        # 'Polyploid' apparently maps to the numeric ploidy code 9 --
        # NOTE(review): confirm this code against the serializer/settings.
        self.assertEqual(strain.genetics.ploidy, 9)
        self.assertFalse(strain.is_subject_to_quarantine)
        self.assertEqual(strain.risk_group, '1')
        self.assertFalse(strain.is_from_registered_collection)
        self.assertEqual(strain.growth.tested_temp_range, {'min': 29, 'max': 32})
# Captured Biolomics web-service payload for one genomic-sequence record;
# fixture for SequenceSerializerTest.  The numeric FieldType codes come from
# the remote service (presumably 5 = text, 14 = sequence, 20 = enum,
# 21 = links, 118 = record links -- TODO confirm against the Biolomics docs).
BIOLOMICSSEQ = {
    'RecordDetails': {
        'Barcode level': {'FieldType': 20, 'Value': 'undefined'},
        'DNA extract number': {'FieldType': 5, 'Value': ''},
        'DNA sequence': {'FieldType': 14,
                         'Value': {'Sequence': 'caaaggaggccttctccctcttcgtaag'}},
        'Editing state': {'FieldType': 20, 'Value': 'Auto import'},
        'Forward primer(s)': {'FieldType': 5, 'Value': ''},
        'Genbank': {'FieldType': 21, 'Value': []},
        'INSDC number': {'FieldType': 5, 'Value': 'AATGAT'},
        'Literature': {'FieldType': 21, 'Value': []},
        'Literature1': {'FieldType': 118, 'Value': []},
        'Marker name': {'FieldType': 5, 'Value': 'CaM'},
        'Privacy': {'FieldType': 20, 'Value': 'undefined'},
        'Quality': {'FieldType': 5, 'Value': ''},
        'Remarks': {'FieldType': 5, 'Value': ''},
        'Reverse primer(s)': {'FieldType': 5, 'Value': ''},
        'Review state': {'FieldType': 5, 'Value': ''},
        'Strain number': {'FieldType': 5, 'Value': 'MUM 02.54'}},
    'RecordId': 101,
    'RecordName': 'MUM 02.54 - CaM'}
class SequenceSerializerTest(unittest.TestCase):
    """Tests for the genomic-sequence <-> Biolomics payload serializers."""

    def test_from_biolomics(self):
        """Each entity attribute mirrors the matching fixture field."""
        details = BIOLOMICSSEQ['RecordDetails']
        marker = sequence_from_biolomics(BIOLOMICSSEQ)
        self.assertEqual(marker.record_name, BIOLOMICSSEQ['RecordName'])
        self.assertEqual(marker.record_id, BIOLOMICSSEQ['RecordId'])
        self.assertEqual(marker.marker_type, details['Marker name']['Value'])
        self.assertEqual(marker.marker_id, details['INSDC number']['Value'])
        self.assertEqual(marker.marker_seq,
                         details['DNA sequence']['Value']['Sequence'])

    def test_to_biolomics(self):
        """Serialization emits exactly the three detail fields plus ids."""
        marker = GenomicSequenceBiolomics()
        marker.record_id = 111
        marker.record_name = 'peioMarker'
        marker.marker_type = 'CaM'
        marker.marker_id = 'GGAAUUA'
        marker.marker_seq = 'aattgacgat'
        expected = {
            'RecordId': marker.record_id,
            'RecordName': marker.record_name,
            'RecordDetails': {
                'INSDC number': {'Value': marker.marker_id, 'FieldType': 'E'},
                'DNA sequence': {'Value': {'Sequence': marker.marker_seq},
                                 'FieldType': 'N'},
                'Marker name': {'Value': marker.marker_type,
                                'FieldType': 'E'}}}
        self.assertEqual(sequence_to_biolomics(marker), expected)
# Captured Biolomics web-service payload for one growth-medium record;
# fixture for MediumSerializerTest.  FieldType 5 appears to be plain text
# and 21 a link list -- TODO confirm against the Biolomics documentation.
BIOLOMICS_MEDIUM = {
    "RecordId": 100,
    "RecordName": "MA20S",
    "RecordDetails": {
        "Full description": {
            "Value": "mout agar+20% saccharose",
            "FieldType": 5
        },
        "Ingredients": {
            "Value": "Malt extract\r\n\tDilute brewery malt with water to 10% sugar solution (level 10 on Brix saccharose meter), 15 minutes at 121 C\r\nsaccharose\t200g\r\ndistilled water\t0.6l\r\nagar\t15g\r\n",
            "FieldType": 5
        },
        "Link to full description": {
            "Value": [],
            "FieldType": 21
        },
        "Medium description": {
            "Value": "",
            "FieldType": 5
        },
        "Other name": {
            "Value": "",
            "FieldType": 5
        },
        "pH": {
            "Value": "7 with KOH",
            "FieldType": 5
        },
        "Remarks": {
            "Value": "",
            "FieldType": 5
        },
        "Reference": {
            "Value": "",
            "FieldType": 5
        },
        "Sterilization conditions": {
            "Value": "15 minutes at 121 C",
            "FieldType": 5
        }
    }
}
class MediumSerializerTest(unittest.TestCase):
    """Tests for the growth-medium deserializer."""

    def test_from_biolomics(self):
        """A deserialized medium mirrors the raw Biolomics record fields."""
        details = BIOLOMICS_MEDIUM['RecordDetails']
        medium = growth_medium_from_biolomics(BIOLOMICS_MEDIUM)
        self.assertEqual(medium.record_id, BIOLOMICS_MEDIUM['RecordId'])
        self.assertEqual(medium.record_name, BIOLOMICS_MEDIUM['RecordName'])
        self.assertEqual(medium.ingredients, details['Ingredients']['Value'])
        self.assertEqual(medium.full_description,
                         details['Full description']['Value'])
        self.assertEqual(medium.ph, details['pH']['Value'])
BIOLOMICS_BIBLIOGRAPHY = {
"RecordId": 100,
"RecordName": "Miscellaneous notes on Mucoraceae",
"RecordDetails": {
"Associated strains": {
"Value": [],
"FieldType": 118
},
"Associated taxa": {
"Value": [],
"FieldType": 118
},
"Authors": {
"Value": "Schipper, M.A.A.; Samson, R.A.",
"FieldType": 5
},
"Associated sequences": {
"Value": [],
"FieldType": 118
},
"Abstract": {
"Value": "",
"FieldType": 5
},
"Collection": {
"Value": "",
"FieldType": 5
},
"DOI number": {
"Value": "",
"FieldType": 5
},
"Editor(s)": {
"Value": "",
"FieldType": 5
},
"Full reference": {
"Value": "",
"FieldType": 5
},
"Hyperlink": {
"Value": [],
"FieldType": 21
},
"ISBN": {
"Value": "",
"FieldType": 5
},
"ISSN": {
"Value": "",
"FieldType": 5
},
"Issue": {
"Value": "",
"FieldType": 5
},
"Journal": {
"Value": "Mycotaxon",
"FieldType": 5
},
"Journal-Book": {
"Value": "",
"FieldType": 5
},
"Keywords": {
"Value": "",
"FieldType": 5
},
"Page from": {
"Value": "475",
"FieldType": 5
},
"Page to": {
"Value": "491",
"FieldType": 5
},
"Publisher": {
"Value": "",
"FieldType": 5
},
"PubMed ID": {
"Value": "",
"FieldType": 5
},
"Volume": {
"Value": "50",
"FieldType": 5
},
"Year": {
"Value": 1994,
"FieldType": 4
}
}
}
class BibliographySerializerTest(unittest.TestCase):
    """Tests for the publication <-> Biolomics payload serializers."""

    def test_from_biolomics(self):
        """A deserialized publication mirrors the raw record fields."""
        pub = literature_from_biolomics(BIOLOMICS_BIBLIOGRAPHY)
        self.assertEqual(pub.record_id, 100)
        self.assertEqual(pub.record_name, "Miscellaneous notes on Mucoraceae")
        self.assertEqual(pub.authors, "Schipper, M.A.A.; Samson, R.A.")
        self.assertEqual(pub.year, 1994)

    def test_to_biolomics(self):
        """Only the fields that are set end up in RecordDetails."""
        pub = Publication()
        pub.title = 'My title'
        pub.authors = 'me and myself'
        pub.year = 1992
        pub.pubmed_id = '1112222'
        pub.issue = 'issue'
        expected = {
            'RecordName': 'My title',
            'RecordDetails': {
                'Authors': {'FieldType': 'E', 'Value': 'me and myself'},
                'PubMed ID': {'FieldType': 'E', 'Value': '1112222'},
                'Issue': {'FieldType': 'E', 'Value': 'issue'},
                'Year': {'FieldType': 'D', 'Value': 1992}}}
        self.assertDictEqual(expected, literature_to_biolomics(pub))

    def test_to_biolomics2(self):
        """Without a title the record name falls back to PUBMED:/DOI: ids."""
        pub = Publication()
        pub.pubmed_id = '1112222'
        expected = {
            'RecordName': f'PUBMED:{pub.pubmed_id}',
            'RecordDetails': {
                'PubMed ID': {'FieldType': 'E', 'Value': '1112222'}}}
        self.assertDictEqual(expected, literature_to_biolomics(pub))

        pub = Publication()
        pub.doi = 'doi.er/111/12131'
        expected = {
            'RecordName': f'DOI:{pub.doi}',
            'RecordDetails': {
                'DOI number': {'FieldType': 'E', 'Value': pub.doi}}}
        self.assertDictEqual(expected, literature_to_biolomics(pub))
if __name__ == "__main__":
    # Removed a leftover debug filter: the uncommented sys.argv override made
    # a plain run execute only BibliographySerializerTest, silently skipping
    # every other test class in this module.  Kept as a template:
    # import sys; sys.argv = ['', 'BibliographySerializerTest']
    unittest.main()

View File

@ -0,0 +1,156 @@
import unittest
from mirri.biolomics.remote.endoint_names import STRAIN_WS
from .utils import VERSION, SERVER_URL, create_full_data_strain
from mirri.biolomics.settings import CLIENT_ID, SECRET_ID, USERNAME, PASSWORD
from mirri.biolomics.remote.biolomics_client import BiolomicsMirriClient
from mirri.biolomics.pipelines.strain import retrieve_strain_by_accession_number
class BiolomicsStrainClientTest(unittest.TestCase):
    """Integration tests for the strain endpoint of the Biolomics client.

    They require network access and valid credentials in
    mirri.biolomics.settings.
    """

    @staticmethod
    def _accession_query(accession_number):
        # Exact-match search on the accession number.  The same query dict
        # was copy-pasted in three tests; it is built here once.
        return {"Query": [{"Index": 0,
                           "FieldName": "Collection accession number",
                           "Operation": "TextExactMatch",
                           "Value": accession_number}],
                "Expression": "Q0",
                "DisplayStart": 0,
                "DisplayLength": 10}

    def setUp(self):
        self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
                                           SECRET_ID, USERNAME, PASSWORD)

    def test_retrieve_strain_by_id(self):
        record_id = 14803
        strain = self.client.retrieve_by_id(STRAIN_WS, record_id)
        self.assertEqual(strain.record_id, record_id)

    def test_retrieve_strain_by_name(self):
        record_id = 14803
        record_name = 'MIRRI0014803'
        strain = self.client.retrieve_by_name(STRAIN_WS, record_name)
        self.assertEqual(strain.record_name, record_name)
        self.assertEqual(strain.record_id, record_id)

    def test_search_strain(self):
        """An exact accession-number search returns exactly one strain."""
        accession_number = "BEA 0014B"
        search_response = self.client.search(
            STRAIN_WS, self._accession_query(accession_number))
        self.assertEqual(search_response['total'], 1)
        self.assertEqual(search_response['records'][0].id.strain_id,
                         accession_number)

    def test_search_strain4(self):
        # Cleanup helper disguised as a test: deletes any 'TESTCC 1' strains
        # left behind by previously failed create/update tests.
        accession_number = "TESTCC 1"
        search_response = self.client.search(
            STRAIN_WS, self._accession_query(accession_number))
        for strain in search_response['records']:
            self.client.delete_by_id(STRAIN_WS, strain.record_id)

    def test_search_strain_no_found(self):
        """Searching a non-existing accession number yields an empty result."""
        accession_number = "BEA 0014B_"
        search_response = self.client.search(
            STRAIN_WS, self._accession_query(accession_number))
        self.assertEqual(search_response['total'], 0)
        self.assertFalse(search_response['records'])

    def test_create_strain(self):
        strain = create_full_data_strain()
        strain.taxonomy.interspecific_hybrid = None
        record_id = None
        try:
            new_strain = self.client.create(STRAIN_WS, strain)
            record_id = new_strain.record_id
            self.assertIsNone(new_strain.taxonomy.interspecific_hybrid)
            self.assertEqual(new_strain.growth.recommended_media, ['AAA'])
            self.assertEqual(new_strain.id.strain_id, strain.id.strain_id)
        finally:
            # Never leave test records in the remote database.
            if record_id is not None:
                self.client.delete_by_id(STRAIN_WS, record_id)

    def test_update_strain(self):
        strain = create_full_data_strain()
        record_id = None
        try:
            new_strain = self.client.create(STRAIN_WS, strain)
            record_id = new_strain.record_id
            self.assertEqual(new_strain.id.strain_id, strain.id.strain_id)
            self.assertFalse(new_strain.taxonomy.interspecific_hybrid)
            new_strain.id.number = '2'
            new_strain.taxonomy.interspecific_hybrid = None
            updated_strain = self.client.update(STRAIN_WS, new_strain)
            self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id)
            self.assertIsNone(updated_strain.taxonomy.interspecific_hybrid)
            # Re-fetch to check the update really reached the server.
            retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id)
            self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id)
            self.assertIsNone(retrieved_strain.taxonomy.interspecific_hybrid)
        finally:
            if record_id is not None:
                self.client.delete_by_id(STRAIN_WS, record_id)

    def test_update_strain_pathogenicity(self):
        strain = create_full_data_strain()
        record_id = None
        try:
            new_strain = self.client.create(STRAIN_WS, strain)
            record_id = new_strain.record_id
            self.assertEqual(new_strain.id.strain_id, strain.id.strain_id)
            self.assertEqual(new_strain.pathogenicity, 'illness')
            # Clearing a field must survive the update round-trip.
            new_strain.pathogenicity = None
            updated_strain = self.client.update(STRAIN_WS, new_strain)
            self.assertEqual(updated_strain.id.strain_id, new_strain.id.strain_id)
            self.assertIsNone(updated_strain.pathogenicity)
            retrieved_strain = self.client.retrieve_by_id(STRAIN_WS, record_id)
            self.assertEqual(retrieved_strain.id.strain_id, new_strain.id.strain_id)
            self.assertIsNone(retrieved_strain.pathogenicity)
        finally:
            if record_id is not None:
                self.client.delete_by_id(STRAIN_WS, record_id)

    def test_search_by_accession_number(self):
        accession_number = "BEA 0014B"
        strain = retrieve_strain_by_accession_number(self.client, accession_number)
        self.assertEqual(strain.id.strain_id, accession_number)

    def test_search_by_accession_number_not_found(self):
        # Renamed: this method previously reused the name of the test above,
        # so Python silently replaced the positive case and it never ran.
        accession_number = "BEA 0014B_"
        strain = retrieve_strain_by_accession_number(self.client, accession_number)
        self.assertFalse(strain)
class BiolomicsClientGrowthMediaTest(unittest.TestCase):
    """Tests for the growth-media endpoint of the Biolomics client."""

    def setUp(self):
        self.client = BiolomicsMirriClient(SERVER_URL, VERSION, CLIENT_ID,
                                           SECRET_ID, USERNAME, PASSWORD)

    def xtest_growth_media_by_name(self):
        # The 'xtest_' prefix keeps this out of unittest discovery.
        growth_medium = self.client.retrieve('growth_media', 'AAA')
        self.assertEqual(growth_medium['Record Id'], 1)
if __name__ == "__main__":
    # Template for running a single test from the command line, e.g.:
    # import sys;sys.argv = ['',
    #                        'BiolomicsWriter.test_mirri_excel_parser_invalid']
    unittest.main()

99
tests/biolomics/utils.py Normal file
View File

@ -0,0 +1,99 @@
from mirri.biolomics.serializers.strain import StrainMirri
from mirri.entities.strain import StrainId, OrganismType
from mirri.entities.sequence import GenomicSequence
from mirri.entities.date_range import DateRange
from mirri.entities.publication import Publication
from mirri.settings import NAGOYA_NO_RESTRICTIONS
# Biolomics web-service API version and the MIRRI *test* server endpoint.
# NOTE(review): SERVER_URL is duplicated in other scripts of this package;
# consider moving it to a single shared settings entry.
VERSION = 'v2'
SERVER_URL = 'https://webservices.bio-aware.com/mirri_test'
def create_full_data_strain():
    """Return a StrainMirri test fixture with a value in (almost) every field.

    Used by the serializer and client tests as a maximal fixture: every
    section (identifiers, taxonomy, deposit, collection, isolation, growth,
    genetics, publications, phenotype) carries data so full round-trips can
    be checked against it.
    """
    strain = StrainMirri()
    # --- identifiers and legal status ---
    strain.id.number = "1"
    strain.id.collection = "TESTCC"
    strain.id.url = "https://cect/2342"
    strain.restriction_on_use = "no_restriction"
    strain.nagoya_protocol = NAGOYA_NO_RESTRICTIONS
    strain.abs_related_files = ['https://example.com']
    strain.mta_files = ['https://example.com']
    strain.other_numbers.append(StrainId(collection="aaa", number="a"))
    strain.other_numbers.append(StrainId(collection="aaa3", number="a3"))
    strain.is_from_registered_collection = False
    strain.risk_group = '1'
    strain.is_potentially_harmful = True
    strain.is_subject_to_quarantine = False
    # --- taxonomy ---
    strain.taxonomy.organism_type = [OrganismType(2)]  # code 2: Archaea
    strain.taxonomy.genus = 'Escherichia'
    strain.taxonomy.species = 'coli'
    strain.taxonomy.interspecific_hybrid = False
    strain.taxonomy.infrasubspecific_name = 'serovar tete'
    strain.taxonomy.comments = 'lalalalla'
    strain.status = "type of Bacillus alcalophilus"
    strain.history = 'firstplave < seconn place < third place'
    # --- deposit ---
    strain.deposit.who = "NCTC, National Collection of Type Cultures - NCTC, London, United Kingdom of Great Britain and Northern Ireland."
    strain.deposit.date = DateRange(year=1985, month=5, day=2)
    strain.catalog_inclusion_date = DateRange(year=1985, month=5, day=2)
    # --- collection event ---
    strain.collect.location.country = "ESP"
    strain.collect.location.state = "una state"
    strain.collect.location.municipality = "one municipality"
    strain.collect.location.longitude = 23.3
    strain.collect.location.latitude = 23.3
    strain.collect.location.altitude = 121
    strain.collect.location.site = "somewhere in the world"
    strain.collect.habitat_ontobiotope = "OBT:000190"
    strain.collect.habitat = 'some habitat'
    strain.collect.who = "the collector"
    strain.collect.date = DateRange(year=1991)
    # --- isolation ---
    strain.isolation.date = DateRange(year=1900)
    strain.isolation.who = 'the isolator'
    strain.isolation.substrate_host_of_isolation = 'some substrate'
    # --- growth conditions ---
    # already existing media in test_mirri
    strain.growth.recommended_temp = {'min': 30, 'max': 30}
    strain.growth.recommended_media = ["AAA"]
    strain.growth.tested_temp_range = {'min': 29, 'max': 32}
    strain.form_of_supply = ["Agar", "Lyo"]
    #strain.other_denominations = ["lajdflasjdldj"]
    # --- genetics ---
    gen_seq = GenomicSequence()
    gen_seq.marker_id = "pepe"
    gen_seq.marker_type = "16S rRNA"
    strain.genetics.markers.append(gen_seq)
    # ploidy 9 -- presumably the project's code for polyploid; TODO confirm
    strain.genetics.ploidy = 9
    strain.genetics.genotype = 'some genotupe'
    strain.genetics.gmo = True
    strain.genetics.gmo_construction = 'instructrion to build'
    strain.genetics.mutant_info = 'x-men'
    strain.genetics.sexual_state = 'MT+A'
    strain.genetics.plasmids = ['asda']
    strain.genetics.plasmids_in_collections = ['asdasda']
    # --- publications and phenotype ---
    pub = Publication()
    pub.title = "The genus Amylomyces"
    strain.publications = [pub]
    strain.plant_pathogenicity_code = 'PATH:001'
    strain.pathogenicity = 'illness'
    strain.enzyme_production = 'some enzimes'
    strain.production_of_metabolites = 'big factory of cheese'
    strain.applications = 'health'
    strain.remarks = 'no remarks for me'
    return strain
if __name__ == '__main__':
    # Manual smoke check: build the fixture and print one populated field.
    strain = create_full_data_strain()
    print(strain.collect.habitat_ontobiotope)

Binary file not shown.

View File

@ -0,0 +1,5 @@
{
"key1": "value1",
"key2": "value2",
"key3": "value3"
}

Binary file not shown.

Binary file not shown.

BIN
tests/data/valid.mirri.xlsx Normal file

Binary file not shown.

318
tests/test_entities.py Normal file
View File

@ -0,0 +1,318 @@
"""
Created on 2 Dec 2020.

@author: peio
"""
import unittest
from mirri.entities.publication import Publication
from mirri.entities.date_range import DateRange
from mirri.entities.location import Location
from mirri.entities.sequence import GenomicSequence
from mirri.entities.strain import (
Collect,
Deposit,
Isolation,
ValidationError,
OrganismType,
Strain,
StrainId,
Taxonomy,
)
from mirri.settings import (
COLLECT,
COUNTRY,
DATE_OF_ISOLATION,
DEPOSIT,
DEPOSITOR,
GENETICS,
GROWTH,
ISOLATED_BY,
ISOLATION,
LOCATION,
MARKERS,
NAGOYA_DOCS_AVAILABLE,
NAGOYA_PROTOCOL,
ORGANISM_TYPE,
OTHER_CULTURE_NUMBERS,
PLOIDY,
RECOMMENDED_GROWTH_MEDIUM,
TAXONOMY,
DATE_OF_INCLUSION, NO_RESTRICTION
)
from mirri.validation.entity_validators import validate_strain
class TestDataRange(unittest.TestCase):
    """Unit tests for the DateRange entity."""

    def test_data_range_init(self):
        # An empty DateRange is falsy and renders as an empty string.
        dr = DateRange()
        self.assertFalse(dr)
        self.assertEqual(str(dr), "")
        self.assertIsNone(dr.range["start"])
        self.assertIsNone(dr.range["end"])

        # Parsing a bare year fills the range and makes it truthy.
        dr.strpdate("2012")
        self.assertEqual(dr.strfdate, "2012----")
        self.assertTrue(dr)
        dr.strpdate("2012----")
        self.assertEqual(dr.strfdate, "2012----")
        dr.strpdate("201212--")
        self.assertEqual(dr.strfdate, "201212--")

        # Month 13 is rejected both when parsing and at construction time.
        with self.assertRaises(ValueError):
            dr.strpdate("201213--")
        with self.assertRaises(ValueError):
            DateRange(year=2012, month=13)

        dr = DateRange(year=2020)
        self.assertEqual(dr.strfdate, "2020----")

        # A year-only range spans the whole year, Jan 1 through Dec 31.
        dr2 = dr.strpdate("2012")
        self.assertEqual(dr2.range["start"].year, 2012)
        self.assertEqual(dr2.range["start"].month, 1)
        self.assertEqual(dr2.range["start"].day, 1)
        self.assertEqual(dr2.range["end"].year, 2012)
        self.assertEqual(dr2.range["end"].month, 12)
        self.assertEqual(dr2.range["end"].day, 31)
class TestCollect(unittest.TestCase):
    """Unit tests for the Collect entity."""

    def test_collect_basic(self):
        collect = Collect()
        self.assertEqual(collect.dict(), {})

        collect.location.country = "ESP"
        collect.date = DateRange().strpdate("2012----")
        collect.who = "pepito"
        # dict() serializes only the fields that were set.
        self.assertEqual(
            dict(collect.dict()),
            {
                "location": {"countryOfOriginCode": "ESP"},
                "collected_by": "pepito",
                "date_of_collection": "2012----",
            },
        )
        # str() resolves the ISO country code to its display name.
        self.assertEqual(str(collect),
                         "Collected: Spain in 2012---- by pepito")
class TestOrganismType(unittest.TestCase):
    """Unit tests for the OrganismType entity."""

    def test_basic_usage(self):
        # Can be built from the numeric code...
        org_type = OrganismType(2)
        self.assertEqual(org_type.name, "Archaea")
        self.assertEqual(org_type.code, 2)

        # Unknown attributes are rejected.
        with self.assertRaises(TypeError):
            org_type.ko = 'a'

        # ... or from the organism type name.  The original test built
        # this instance without asserting anything; assert the reverse
        # mapping shown above (code 2 <-> "Archaea").
        org_type = OrganismType("Archaea")
        self.assertEqual(org_type.code, 2)
class TestTaxonomy(unittest.TestCase):
    """Unit tests for the Taxonomy entity."""

    def test_taxonomy_basic(self):
        # A freshly created taxonomy is empty and falsy.
        empty_taxonomy = Taxonomy()
        self.assertFalse(empty_taxonomy)
        self.assertEqual(empty_taxonomy.dict(), {})

    def test_taxonomy_with_data(self):
        taxonomy = Taxonomy()
        taxonomy.genus = "Bacilus"
        taxonomy.organism_type = [OrganismType("Archaea")]
        taxonomy.species = "vulgaris"
        # long_name is composed from genus and species.
        self.assertEqual(taxonomy.long_name, "Bacilus vulgaris")
class TestLocation(unittest.TestCase):
    """Unit tests for the Location entity."""

    def test_empty_init(self):
        location = Location()
        self.assertFalse(location)
        self.assertEqual(location.dict(), {})

    def test_add_data(self):
        location = Location()
        location.country = "esp"
        self.assertEqual(location.dict(), {COUNTRY: "esp"})
        # Setting a field to None must not add it to the serialization.
        location.state = None
        self.assertEqual(location.dict(), {COUNTRY: "esp"})
class TestStrain(unittest.TestCase):
    """Unit tests for the Strain entity and its serialization."""

    def test_empty_strain(self):
        strain = Strain()
        self.assertEqual(strain.dict(), {})

    def test_strain_add_data(self):
        strain = Strain()
        strain.id.number = "5433"
        strain.id.collection = "CECT"
        strain.id.url = "https://cect/2342"

        # Only controlled-vocabulary values are accepted for Nagoya.
        with self.assertRaises(ValidationError):
            strain.nagoya_protocol = "asdas"
        strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE
        # Bug fix: the original assigned into the dict returned by
        # dict() (a no-op) instead of asserting the serialized value.
        self.assertEqual(strain.dict()[NAGOYA_PROTOCOL],
                         NAGOYA_DOCS_AVAILABLE)

        strain.collect.location.country = "ESP"
        self.assertEqual(strain.dict()[COLLECT][LOCATION][COUNTRY], "ESP")

        strain.genetics.ploidy = 9
        self.assertEqual(strain.dict()[GENETICS][PLOIDY], 9)

        strain.growth.recommended_media = ["asd"]
        strain.isolation.date = DateRange(year=1900)
        self.assertEqual(strain.dict()[ISOLATION][DATE_OF_ISOLATION],
                         "1900----")

        strain.deposit.who = "pepe"
        self.assertEqual(strain.dict()[DEPOSIT][DEPOSITOR], "pepe")

        strain.growth.recommended_media = ["11"]
        self.assertEqual(strain.dict()[GROWTH][RECOMMENDED_GROWTH_MEDIUM],
                         ["11"])

        # Organism type can be set by numeric code or by name.
        strain.taxonomy.organism_type = [OrganismType(2)]
        self.assertEqual(strain.dict()[TAXONOMY][ORGANISM_TYPE],
                         [{"code": 2, "name": "Archaea"}])
        strain.taxonomy.organism_type = [OrganismType("Algae")]
        self.assertEqual(strain.dict()[TAXONOMY][ORGANISM_TYPE],
                         [{"code": 1, "name": "Algae"}])

        strain.other_numbers.append(StrainId(collection="aaa", number="a"))
        strain.other_numbers.append(StrainId(collection="aaa3", number="a3"))
        self.assertEqual(
            strain.dict()[OTHER_CULTURE_NUMBERS],
            [
                {"collection_code": "aaa", "accession_number": "a"},
                {"collection_code": "aaa3", "accession_number": "a3"},
            ],
        )

        strain.form_of_supply = ["Agar", "Lyo"]

        gen_seq = GenomicSequence()
        self.assertEqual(gen_seq.dict(), {})
        gen_seq.marker_id = "pepe"
        gen_seq.marker_type = "16S rRNA"
        strain.genetics.markers.append(gen_seq)
        self.assertEqual(
            strain.dict()[GENETICS][MARKERS],
            [{"marker_type": "16S rRNA", "INSDC": "pepe"}],
        )

        # Ontobiotope terms must match the OBT:NNNNNN pattern; a short
        # identifier is rejected.
        strain.collect.habitat_ontobiotope = "OBT:111111"
        self.assertEqual(strain.collect.habitat_ontobiotope, "OBT:111111")
        with self.assertRaises(ValidationError):
            strain.collect.habitat_ontobiotope = "OBT:11111"

        # publications must be a list of Publication instances.
        with self.assertRaises(ValidationError):
            strain.publications = 1
        pub = Publication()
        pub.id = "1"
        with self.assertRaises(ValidationError):
            strain.publications = pub
        strain.publications = [pub]
        self.assertEqual(strain.publications[0].id, "1")

        strain.catalog_inclusion_date = DateRange(year=1992)
        self.assertEqual(strain.dict()[DATE_OF_INCLUSION], '1992----')

    def test_strain_validation(self):
        strain = Strain()
        strain.form_of_supply = ['Lyo']
        # The original disabled the rest of this test with a bare
        # "return", so it silently passed.  Skip explicitly instead so
        # the disabled state shows up in the test report.
        # TODO(review): re-enable once validate_strain error counts are
        # confirmed.
        self.skipTest('validation assertions disabled pending review')

        errors = validate_strain(strain)
        self.assertEqual(len(errors), 10)

        strain.id.collection = 'test'
        strain.id.number = '1'
        errors = validate_strain(strain)
        self.assertEqual(len(errors), 9)

        strain.nagoya_protocol = NAGOYA_DOCS_AVAILABLE
        strain.restriction_on_use = NO_RESTRICTION
        strain.risk_group = 1
        strain.taxonomy.organism_type = [OrganismType(4)]
        strain.taxonomy.hybrids = ['Sac lac', 'Sac lcac3']
        strain.growth.recommended_media = ['aa']
        strain.growth.recommended_temp = {'min': 2, 'max': 5}
        strain.form_of_supply = ['lyo']
        strain.collect.location.country = 'ESP'
        errors = validate_strain(strain)
        self.assertFalse(errors)
class TestIsolation(unittest.TestCase):
    """Unit tests for the Isolation entity."""

    def test_iniatialize_isollation(self):
        isolation = Isolation()
        self.assertEqual(isolation.dict(), {})

        isolation.who = "pepito"
        self.assertIn(ISOLATED_BY, isolation.dict())

        isolation.date = DateRange().strpdate("2012----")
        self.assertIn(DATE_OF_ISOLATION, isolation.dict())

        # Setting a site on the isolation location is rejected.
        with self.assertRaises((ValueError, AttributeError)):
            isolation.location.site = "spain"
class TestGenomicSequence(unittest.TestCase):
    """Unit tests for the GenomicSequence entity."""

    def test_empty_init(self):
        sequence = GenomicSequence()
        self.assertEqual(sequence.dict(), {})

        sequence.marker_id = "pepe"
        sequence.marker_type = "16S rRNA"
        expected = {"marker_type": "16S rRNA", "INSDC": "pepe"}
        self.assertEqual(sequence.dict(), expected)
if __name__ == "__main__":
# import sys;sys.argv = ['', 'TestStrain']
unittest.main()

51
tests/test_parsers.py Normal file
View File

@ -0,0 +1,51 @@
from mirri.entities.strain import ValidationError
import unittest
from pathlib import Path
from pprint import pprint
from mirri.io.parsers.mirri_excel import parse_mirri_excel
TEST_DATA_DIR = Path(__file__).parent / "data"
class MirriExcelTests(unittest.TestCase):
    """Tests for the MIRRI excel parser."""

    def test_mirri_excel_parser(self):
        in_path = TEST_DATA_DIR / "valid.mirri.xlsx"
        with in_path.open("rb") as fhand:
            parsed_data = parse_mirri_excel(fhand, version="20200601")

        # Growth media keep their acronym and description.
        medium = parsed_data["growth_media"][0]
        self.assertEqual("1", medium.acronym)
        self.assertEqual(medium.description, "NUTRIENT BROTH/AGAR I")

        strains = list(parsed_data["strains"])
        strain = strains[0]
        self.assertEqual(strain.publications[0].id, 1)
        self.assertEqual(strain.publications[0].title, 'Cosa')
        self.assertEqual(strain.id.number, "1")

    def xtest_mirri_excel_parser_invalid_fail(self):
        # Disabled (xtest): parsing an invalid workbook must raise.
        in_path = TEST_DATA_DIR / "invalid.mirri.xlsx"
        with in_path.open("rb") as fhand:
            with self.assertRaises(ValidationError):
                parse_mirri_excel(fhand, version="20200601")

    def xtest_mirri_excel_parser_invalid(self):
        # Disabled (xtest): inspect the errors reported for an invalid
        # workbook.
        in_path = TEST_DATA_DIR / "invalid.mirri.xlsx"
        with in_path.open("rb") as fhand:
            parsed_data = parse_mirri_excel(fhand, version="20200601")
        errors = parsed_data["errors"]
        for _id, _errors in errors.items():
            print(_id, _errors)
if __name__ == "__main__":
# import sys;sys.argv = ['',
# 'MirriExcelTests.test_mirri_excel_parser_invalid']
unittest.main()

589
tests/test_validation.py Normal file
View File

@ -0,0 +1,589 @@
from datetime import datetime
import unittest
from pathlib import Path
from itertools import chain
from mirri.validation.tags import (
CHOICES,
COORDINATES,
CROSSREF,
CROSSREF_NAME,
DATE,
MATCH,
MISSING,
MULTIPLE,
NUMBER,
REGEXP,
SEPARATOR,
TAXON,
TYPE,
UNIQUE,
VALUES
)
from mirri.validation.excel_validator import (
is_valid_choices,
is_valid_coords,
is_valid_crossrefs,
is_valid_date,
is_valid_missing,
is_valid_number,
is_valid_regex,
is_valid_taxon,
is_valid_unique,
is_valid_file,
validate_mirri_excel,
)
TEST_DATA_DIR = Path(__file__).parent / "data"
TS_VALUE = "value"
TS_CONF = "conf"
TS_ASSERT = "assert_func"
class MirriExcelValidationTests(unittest.TestCase):
    """End-to-end validation tests over whole MIRRI excel workbooks."""

    def _collect_errors(self, file_name):
        """Validate a workbook; return (entities, error codes) reported."""
        in_path = TEST_DATA_DIR / file_name
        with in_path.open("rb") as fhand:
            error_log = validate_mirri_excel(fhand)
        entities = []
        err_codes = []
        for entity, errors in error_log.get_errors().items():
            entities.append(entity)
            err_codes.extend(error.code for error in errors)
        return entities, err_codes

    def test_validation_structure(self):
        entities, err_codes = self._collect_errors(
            "invalid_structure.mirri.xlsx")
        for entity in ("EFS", "STD", "GOD", "GMD"):
            self.assertIn(entity, entities)
        for code in ("EFS03", "EFS06", "EFS08", "GOD06", "GMD01",
                     "STD05", "STD08", "STD12"):
            self.assertIn(code, err_codes)

    def test_validation_content(self):
        entities, err_codes = self._collect_errors(
            "invalid_content.mirri.xlsx")
        self.assertTrue(err_codes)
        # Structural (EFS) errors must not appear for a well-formed file.
        self.assertNotIn("EFS", entities)
        for entity in ("STD", "GOD", "GID"):
            self.assertIn(entity, entities)
        for code in ("GOD04", "GOD07", "GID03", "STD11", "STD15", "STD22",
                     "STD04", "STD10", "STD07", "STD14", "STD16"):
            self.assertIn(code, err_codes)

    def test_validation_valid(self):
        in_path = TEST_DATA_DIR / "valid.mirri.xlsx"
        with in_path.open("rb") as fhand:
            error_log = validate_mirri_excel(fhand)
        self.assertEqual(len(error_log.get_errors()), 0)
class ValidatoionFunctionsTest(unittest.TestCase):
    """Unit tests for the per-cell validation functions.

    Every test method builds a table of (value, configuration,
    expected_valid) cases and delegates to _run_cases, which checks the
    validator's verdict for each case under a subTest.  The original
    repeated the identical runner loop in every method.
    """

    def _run_cases(self, validator, cases):
        """Assert validator(value, conf) is truthy iff expected_valid."""
        for value, conf, expected_valid in cases:
            with self.subTest(value=value):
                if expected_valid:
                    self.assertTrue(validator(value, conf))
                else:
                    self.assertFalse(validator(value, conf))

    def test_is_valid_regex(self):
        alpha = {TYPE: REGEXP, MATCH: r"[a-zA-Z]+"}
        digits = {TYPE: REGEXP, MATCH: r"\d+"}
        words = {TYPE: REGEXP, MATCH: r"\w+(\s\w+)*$"}
        cases = [
            ("abcDEF", alpha, True),
            ("123456", alpha, False),
            ("123456", digits, True),
            ("abcdef", digits, False),
            ("abc 123", words, True),
            ("123 abc", words, True),
            ("123 ", words, False),  # trailing whitespace breaks the match
        ]
        self._run_cases(is_valid_regex, cases)

    def test_is_valid_choices(self):
        cases = [
            ("1", {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]}, True),
            ("1, 3",
             {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"],
              MULTIPLE: True, SEPARATOR: ","},
             True),
            ("5", {TYPE: CHOICES, VALUES: ["1", "2", "3", "4"]}, False),
        ]
        self._run_cases(is_valid_choices, cases)

    def test_is_valid_crossref(self):
        def crossref_conf(multiple=False):
            # Build a fresh conf per case, as the original did.
            conf = {
                TYPE: CROSSREF,
                CROSSREF_NAME: "values",
                "crossrefs_pointer": {"values": ["abc", "def", "ghi"]},
            }
            if multiple:
                conf[MULTIPLE] = True
                conf[SEPARATOR] = ","
            return conf

        cases = [
            ("abc", crossref_conf(), True),
            ("123", crossref_conf(), False),
            ("abc, def", crossref_conf(multiple=True), True),
            ("abc, 123", crossref_conf(multiple=True), False),
        ]
        self._run_cases(is_valid_crossrefs, cases)

    def test_is_valid_missing(self):
        cases = [
            (1, {TYPE: MISSING}, True),
            ("abc", {TYPE: MISSING}, True),
            (None, {TYPE: MISSING}, False),
        ]
        self._run_cases(is_valid_missing, cases)

    def test_is_valid_date(self):
        conf = {TYPE: DATE}
        cases = [
            ('2020-04-07', conf, True),
            ('2020/04/07', conf, True),
            (datetime(2021, 5, 1), conf, True),
            ('2020-05', conf, True),
            ('2020/05', conf, True),
            (2020, conf, True),
            ('2021 05 01', conf, False),  # spaces are not a separator
            ('04-07-2020', conf, False),  # day-first order rejected
            ('2021-02-31', conf, False),  # impossible calendar date
            ('2021-15', conf, False),     # month out of range
            ('15-2021', conf, False),
            (3000, conf, False),          # year outside accepted range
            (-2020, conf, False),
        ]
        self._run_cases(is_valid_date, cases)

    def test_is_valid_coordinates(self):
        conf = {TYPE: COORDINATES}
        cases = [
            ("23; 50", conf, True),
            ("-90; -100", conf, True),
            ("90; 100", conf, True),
            ("0; 0", conf, True),
            ("10; 20; 5", conf, True),   # optional third component
            ("10; 20; -5", conf, True),
            ("91; 50", conf, False),     # latitude out of range
            ("87; 182", conf, False),    # longitude out of range
            ("-200; 182", conf, False),
            ("20, 40", conf, False),     # wrong separator
            ("abc def", conf, False),
            (123, conf, False),
        ]
        self._run_cases(is_valid_coords, cases)

    def test_is_valid_number(self):
        cases = [
            (1, {TYPE: NUMBER}, True),
            (2.5, {TYPE: NUMBER}, True),
            ("10", {TYPE: NUMBER}, True),
            ("10.5", {TYPE: NUMBER}, True),
            (5, {TYPE: NUMBER, "min": 0}, True),
            (5, {TYPE: NUMBER, "max": 10}, True),
            (5, {TYPE: NUMBER, "min": 0, "max": 10}, True),
            ("hello", {TYPE: NUMBER}, False),
            (10, {TYPE: NUMBER, "max": 5}, False),
            (0, {TYPE: NUMBER, "min": 5}, False),
        ]
        self._run_cases(is_valid_number, cases)

    def test_is_valid_taxon(self):
        conf = {TYPE: TAXON}
        cases = [
            ('sp. species', conf, True),
            ('spp species subsp. subspecies', conf, True),
            ('spp species subsp. subspecies var. variety', conf, True),
            ('spp taxon', conf, True),
            ('Candidaceae', conf, True),
            ('sp sp species', conf, False),
            ('spp species abc. def', conf, False),  # unknown rank marker
        ]
        self._run_cases(is_valid_taxon, cases)

    def test_is_valid_unique(self):
        # Each case gets its own conf dict: shown_values presumably
        # records the values already seen, so sharing one would couple
        # the cases (matches the original's per-case dicts).
        cases = [
            ("abc",
             {TYPE: UNIQUE, "label": "values", "shown_values": {}},
             True),
            ("jkl",
             {TYPE: UNIQUE, "label": "values",
              "shown_values": {"values": {"abc": '', "def": '',
                                          "ghi": ''}}},
             True),
            ("abc",
             {TYPE: UNIQUE, "label": "values",
              "shown_values": {"values": {"abc": '', "def": '',
                                          "ghi": ''}}},
             False),
        ]
        self._run_cases(is_valid_unique, cases)

    def test_is_valid_file(self):
        # is_valid_file takes no configuration, so it is checked directly.
        cases = [
            (TEST_DATA_DIR / "invalid_structure.mirri.xlsx", True),
            (TEST_DATA_DIR / "invalid_excel.mirri.json", False),
        ]
        for path, expected_valid in cases:
            with self.subTest(value=path):
                if expected_valid:
                    self.assertTrue(is_valid_file(path))
                else:
                    self.assertFalse(is_valid_file(path))
if __name__ == "__main__":
import sys
# sys.argv = ['',
# 'ValidatoionFunctionsTest.test_is_valid_regex']
unittest.main()

24
tests/test_writers.py Normal file
View File

@ -0,0 +1,24 @@
import tempfile
import unittest
from pathlib import Path

from mirri.io.parsers.mirri_excel import parse_mirri_excel
from mirri.io.writers.mirri_excel import write_mirri_excel
TEST_DATA_DIR = Path(__file__).parent / "data"
class MirriExcelTests(unittest.TestCase):
    """Round-trip test: parse a MIRRI excel workbook and write it back."""

    def test_valid_excel(self):
        in_path = TEST_DATA_DIR / "valid.mirri.full.xlsx"
        # Close the input handle deterministically (the original opened
        # it without ever closing it).
        with in_path.open('rb') as fhand:
            parsed_data = parse_mirri_excel(fhand, version="20200601")
        strains = parsed_data["strains"]
        growth_media = parsed_data["growth_media"]

        # Write into a temporary directory instead of a hard-coded /tmp
        # path, so the test is portable and cleans up after itself.
        with tempfile.TemporaryDirectory() as tmp_dir:
            out_path = Path(tmp_dir) / "test.xlsx"
            write_mirri_excel(out_path, strains, growth_media,
                              version="20200601")
if __name__ == "__main__":
# import sys;sys.argv = ['',
# 'BiolomicsWriter.test_mirri_excel_parser_invalid']
unittest.main()