Source code for idutils

# -*- coding: utf-8 -*-
# This file is part of IDUtils
# Copyright (C) 2015-2018 CERN.
# Copyright (C) 2018 Alan Rubin.
# IDUtils is free software; you can redistribute it and/or modify
# it under the terms of the Revised BSD License; see LICENSE file for
# more details.
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""Small library for persistent identifiers used in scholarly communication."""

from __future__ import absolute_import, print_function

import re

from isbn import ISBN
from six.moves.urllib.parse import urlparse

from .version import __version__

    "ENSPMA",  # Petromyzon marinus (Lamprey)
    "ENSNGA",  # Nannospalax galili (Upper Galilee mountains blind mole rat)
    "ENSOPR",  # Ochotona princeps (Pika)
    "ENSMNE",  # Macaca nemestrina (Pig-tailed macaque)
    "MGP_C57BL6NJ_",  # Mus musculus (Mouse C57BL/6NJ)
    "MGP_LPJ_",  # Mus musculus (Mouse LP/J)
    "FB",  # Drosophila melanogaster (Fruitfly)
    "ENSORL",  # Oryzias latipes (Medaka)
    "ENSONI",  # Oreochromis niloticus (Tilapia)
    "ENSOCU",  # Oryctolagus cuniculus (Rabbit)
    "ENSXET",  # Xenopus tropicalis (Xenopus)
    "ENSRRO",  # Rhinopithecus roxellana (Golden snub-nosed monkey)
    "ENSCAT",  # Cercocebus atys (Sooty mangabey)
    "ENSAME",  # Ailuropoda melanoleuca (Panda)
    "MGP_CASTEiJ_",  # Mus musculus castaneus (Mouse CAST/EiJ)
    "ENSCSAV",  # Ciona savignyi
    "ENSMAU",  # Mesocricetus auratus (Golden Hamster)
    "ENSFAL",  # Ficedula albicollis (Flycatcher)
    "ENSTRU",  # Takifugu rubripes (Fugu)
    "ENSPTR",  # Pan troglodytes (Chimpanzee)
    "ENSTTR",  # Tursiops truncatus (Dolphin)
    "ENSCJA",  # Callithrix jacchus (Marmoset)
    "ENSSAR",  # Sorex araneus (Shrew)
    "ENSVPA",  # Vicugna pacos (Alpaca)
    "ENSLAC",  # Latimeria chalumnae (Coelacanth)
    "ENSPVA",  # Pteropus vampyrus (Megabat)
    "ENSPAN",  # Papio anubis (Olive baboon)
    "ENSHGLF",  # Heterocephalus glaber (Naked mole-rat female)
    "MGP_PWKPhJ_",  # Mus musculus musculus (Mouse PWK/PhJ)
    "MGP_NZOHlLtJ_",  # Mus musculus (Mouse NZO/HlLtJ)
    "ENSCAF",  # Canis lupus familiaris (Dog)
    "MGP_AJ_",  # Mus musculus (Mouse A/J)
    "ENSMOD",  # Monodelphis domestica (Opossum)
    "ENSMGA",  # Meleagris gallopavo (Turkey)
    "ENSPCO",  # Propithecus coquereli (Coquerel's sifaka)
    "ENSFDA",  # Fukomys damarensis (Damara mole rat)
    "ENSBTA",  # Bos taurus (Cow)
    "ENSGAL",  # Gallus gallus (Chicken)
    "ENSLAF",  # Loxodonta africana (Elephant)
    "ENSGGO",  # Gorilla gorilla gorilla (Gorilla)
    "ENSCAP",  # Cavia aperea (Brazilian guinea pig)
    "ENSMMU",  # Macaca mulatta (Macaque)
    "ENSAPL",  # Anas platyrhynchos (Duck)
    "ENSCEL",  # Caenorhabditis elegans (Caenorhabditis elegans)
    "ENSMEU",  # Notamacropus eugenii (Wallaby)
    "ENSCGR",  # Cricetulus griseus (Chinese hamster CriGri)
    "ENSANA",  # Aotus nancymaae (Ma's night monkey)
    "ENSGMO",  # Gadus morhua (Cod)
    "ENSPEM",  # Peromyscus maniculatus bairdii (Northern American deer mouse)
    "MGP_C3HHeJ_",  # Mus musculus (Mouse C3H/HeJ)
    "ENSTGU",  # Taeniopygia guttata (Zebra Finch)
    "ENSSCE",  # Saccharomyces cerevisiae (Saccharomyces cerevisiae)
    "ENSOGA",  # Otolemur garnettii (Bushbaby)
    "ENSACA",  # Anolis carolinensis (Anole lizard)
    "ENSTSY",  # Carlito syrichta (Tarsier)
    "ENSTBE",  # Tupaia belangeri (Tree Shrew)
    "MGP_AKRJ_",  # Mus musculus (Mouse AKR/J)
    "ENSDAR",  # Danio rerio (Zebrafish)
    "ENSMUS",  # Mus musculus (Mouse)
    "ENSETE",  # Echinops telfairi (Lesser hedgehog tenrec)
    "ENSSBO",  # Saimiri boliviensis boliviensis (Bolivian squirrel monkey)
    "ENS",  # Homo sapiens (Human)
    "ENSCGR",  # Cricetulus griseus (Chinese hamster CHOK1GS)
    "ENSFCA",  # Felis catus (Cat)
    "MGP_BALBcJ_",  # Mus musculus (Mouse BALB/cJ)
    "MGP_PahariEiJ_",  # Mus pahari (Shrew mouse)
    "ENSCSA",  # Chlorocebus sabaeus (Vervet-AGM)
    "ENSCCA",  # Cebus capucinus imitator (Capuchin)
    "ENSOAR",  # Ovis aries (Sheep)
    "ENSCHI",  # Capra hircus (Goat)
    "ENSDOR",  # Dipodomys ordii (Kangaroo rat)
    "ENSCHO",  # Choloepus hoffmanni (Sloth)
    "ENSSHA",  # Sarcophilus harrisii (Tasmanian devil)
    "ENSMPU",  # Mustela putorius furo (Ferret)
    "ENSNLE",  # Nomascus leucogenys (Gibbon)
    "ENSXMA",  # Xiphophorus maculatus (Platyfish)
    "ENSSSC",  # Sus scrofa (Pig)
    "ENSEEU",  # Erinaceus europaeus (Hedgehog)
    "ENSPSI",  # Pelodiscus sinensis (Chinese softshell turtle)
    "MGP_DBA2J_",  # Mus musculus (Mouse DBA/2J)
    "ENSAMX",  # Astyanax mexicanus (Cave fish)
    "MGP_WSBEiJ_",  # Mus musculus domesticus (Mouse WSB/EiJ)
    "ENSJJA",  # Jaculus jaculus (Lesser Egyptian jerboa)
    "ENSCIN",  # Ciona intestinalis
    "ENSPPA",  # Pan paniscus (Bonobo)
    "MGP_SPRETEiJ_",  # Mus spretus (Algerian mouse)
    "ENSCAN",  # Colobus angolensis palliatus (Angola colobus)
    "MGP_NODShiLtJ_",  # Mus musculus (Mouse NOD/ShiLtJ)
    "ENSCLA",  # Chinchilla lanigera (Long-tailed chinchilla)
    "ENSCPO",  # Cavia porcellus (Guinea Pig)
    "ENSDNO",  # Dasypus novemcinctus (Armadillo)
    "ENSPFO",  # Poecilia formosa (Amazon molly)
    "ENSMIC",  # Microcebus murinus (Mouse Lemur)
    "MGP_FVBNJ_",  # Mus musculus (Mouse FVB/NJ)
    "MGP_CBAJ_",  # Mus musculus (Mouse CBA/J)
    "ENSSTO",  # Ictidomys tridecemlineatus (Squirrel)
    "ENSRNO",  # Rattus norvegicus (Rat)
    "ENSMOC",  # Microtus ochrogaster (Prairie vole)
    "ENSTNI",  # Tetraodon nigroviridis (Tetraodon)
    "ENSPPY",  # Pongo abelii (Orangutan)
    "ENSGAC",  # Gasterosteus aculeatus (Stickleback)
    "ENSLOC",  # Lepisosteus oculatus (Spotted gar)
    "ENSODE",  # Octodon degus (Degu)
    "ENSPCA",  # Procavia capensis (Hyrax)
    "ENSECA",  # Equus caballus (Horse)
    "ENSOAN",  # Ornithorhynchus anatinus (Platypus)
    "MGP_CAROLIEiJ_",  # Mus caroli (Ryukyu mouse)
    "ENSHGLM",  # Heterocephalus glaber (Naked mole-rat male)
    "MGP_129S1SvImJ_",  # Mus musculus (Mouse 129S1/SvImJ)
    "ENSRBI",  # Rhinopithecus bieti (Black snub-nosed monkey)
    "ENSMLU",  # Myotis lucifugus (Microbat)
    "ENSMLE",  # Mandrillus leucophaeus (Drill)
    "ENSMFA",  # Macaca fascicularis (Crab-eating macaque)
"""List of species-specific prefixes for Ensembl accession numbers."""

doi_regexp = re.compile(

handle_regexp = re.compile(

<Handle>          = <NamingAuthority> "/" <LocalName>
<NamingAuthority> = *(<NamingAuthority>  ".") <NAsegment>
<NAsegment>       = Any UTF8 char except "/" and "."
<LocalName>       = Any UTF8 char

arxiv_post_2007_regexp = re.compile(
"""Regular expression for post-2007 (new-style) arXiv identifiers."""

arxiv_pre_2007_regexp = re.compile(
"""Regular expression for pre-2007 (old-style) arXiv identifiers."""

arxiv_post_2007_with_class_regexp = re.compile(
"""Matches new style arXiv ID, with an old-style class specification;
    technically malformed, however appears in real data."""

# ADS bibliographic code: optional "ads:"/"ADS:" prefix (group 1) followed by
# the bibcode itself (group 2) -- four digits, one letter, thirteen
# non-whitespace characters and a final uppercase letter, "." or ":".
ads_regexp = re.compile(r"(ads:|ADS:)?(\d{4}[A-Za-z]\S{13}[A-Z.:])$")

pmcid_regexp = re.compile(r"PMC\d+$", flags=re.I)
"""PubMed Central ID regular expression."""

pmid_regexp = re.compile(r"(pmid:)?(\d+)$", flags=re.I)
"""PubMed ID regular expression."""

ark_suffix_regexp = re.compile(r"ark:/\d+/.+$")
"""ARK suffix regular expression (``ark:/<digits>/<name>``)."""

# LSID: "urn:lsid:" + authority, then two or three further ":"-separated
# components; matched case-insensitively.
lsid_regexp = re.compile(r"urn:lsid:[^:]+(:[^:]+){2,3}$", flags=re.I)

orcid_url = ""

gnd_regexp = re.compile(

gnd_resolver_url = ""

sra_regexp = re.compile(r"[SED]R[APRSXZ]\d+$")
"""Sequence Read Archive regular expression."""

bioproject_regexp = re.compile(r"PRJ(NA|EA|EB|DB)\d+$")
"""BioProject regular expression."""

biosample_regexp = re.compile(r"SAM(N|EA|D)\d+$")
"""BioSample regular expression."""

ensembl_regexp = re.compile("({prefixes})(E|FM|G|GT|P|R|T)\d{{11}}$".format(
"""Ensembl regular expression."""

uniprot_regexp = re.compile("([A-N,R-Z][0-9]([A-Z][A-Z,0-9]{2}[0-9]){1,2})|"
"""UniProt regular expression."""

refseq_regexp = re.compile("((AC|NC|NG|NT|NW|NM|NR|XM|XR|AP|NP|YP|XP|WP)_|"
"""RefSeq regular expression."""

genome_regexp = re.compile(r"GC[AF]_\d+\.\d+$")
"""GenBank or RefSeq genome assembly accession."""

def _convert_x_to_10(x):
    """Return the numeric value of a check character.

    :param x: A single character; ``'X'`` stands for the value 10.
    :returns: ``10`` for ``'X'``, otherwise ``int(x)``.
    :raises ValueError: If ``x`` is neither ``'X'`` nor parseable by ``int``.
    """
    if x == 'X':
        return 10
    return int(x)

def is_isbn10(val):
    """Test if argument is an ISBN-10 number.

    Hyphens and spaces are ignored and the value is upper-cased before the
    weighted (10..1) modulo-11 checksum is verified.

    :param val: Candidate identifier string.
    :returns: ``True`` if ``val`` is a valid ISBN-10, ``False`` otherwise.
    """
    val = val.replace("-", "").replace(" ", "").upper()
    if len(val) != 10:
        return False
    # 'X' (value 10) is only legal as the trailing check character; letting
    # it through elsewhere made strings such as "00000000X2" validate.
    if "X" in val[:-1]:
        return False
    try:
        total = sum(
            (10 - i) * (10 if ch == "X" else int(ch))
            for i, ch in enumerate(val)
        )
    except ValueError:
        # A character that is neither a digit nor the final 'X'.
        return False
    return total % 11 == 0
def is_isbn13(val):
    """Test if argument is an ISBN-13 number.

    Hyphens and spaces are ignored. The first twelve characters are weighted
    alternately by 1 and 3 and the result is compared with the final check
    digit.

    :param val: Candidate identifier string.
    :returns: ``True`` if the check digit matches, ``False`` otherwise
        (including wrong length or non-digit characters).
    """
    digits = val.replace("-", "").replace(" ", "").upper()
    if len(digits) != 13:
        return False
    try:
        weighted = 0
        for position, char in enumerate(digits[:12]):
            # Even positions weigh 1, odd positions weigh 3.
            weighted += int(char) * (3 if position % 2 else 1)
        expected = (10 - weighted) % 10
        return expected == int(digits[-1])
    except ValueError:
        return False
def is_isbn(val):
    """Test if argument is an ISBN-10 or ISBN-13 number.

    A value that passes either checksum is accepted when it starts with
    ``978``/``979`` or when it is not also a valid EAN-13.

    :param val: Candidate identifier string.
    :returns: ``True`` or ``False``.
    """
    if not (is_isbn10(val) or is_isbn13(val)):
        return False
    return val[0:3] in ["978", "979"] or not is_ean13(val)
def is_issn(val):
    """Test if argument is an ISSN number.

    Hyphens and spaces are ignored and the value is upper-cased before the
    weighted (8..1) modulo-11 checksum is verified.

    :param val: Candidate identifier string.
    :returns: ``True`` if ``val`` is a valid ISSN, ``False`` otherwise.
    """
    val = val.replace("-", "").replace(" ", "").upper()
    if len(val) != 8:
        return False
    # 'X' (value 10) is only legal as the trailing check character; letting
    # it through elsewhere made strings such as "000000X2" validate.
    if "X" in val[:-1]:
        return False
    try:
        total = sum(
            (8 - i) * (10 if ch == "X" else int(ch))
            for i, ch in enumerate(val)
        )
    except ValueError:
        # A character that is neither a digit nor the final 'X'.
        return False
    return total % 11 == 0
def is_istc(val):
    """Test if argument is an International Standard Text Code.

    Hyphens and spaces are ignored. The first fifteen hexadecimal characters
    are weighted cyclically by 11, 9, 3, 1; the sum modulo 16, written as an
    uppercase hex digit, must equal the last character.

    :param val: Candidate identifier string.
    :returns: ``True`` if the check character matches, ``False`` otherwise.
    """
    code = val.replace("-", "").replace(" ", "").upper()
    if len(code) != 16:
        return False
    weights = (11, 9, 3, 1)
    try:
        total = 0
        for index, char in enumerate(code[:-1]):
            total += int(char, 16) * weights[index % 4]
    except ValueError:
        # Non-hexadecimal character in the body.
        return False
    return "{0:X}".format(total % 16) == code[-1]
def is_doi(val):
    """Test if argument is a DOI.

    :param val: Candidate identifier string.
    :returns: The match object from ``doi_regexp`` (truthy), or ``None``.
    """
    match = doi_regexp.match(val)
    return match
def is_handle(val):
    """Test if argument is a Handle.

    Note that DOIs are also handles, and the handle syntax is generic enough
    that it also matches e.g. any URL you pass in.

    :param val: Candidate identifier string.
    :returns: The match object from ``handle_regexp`` (truthy), or ``None``.
    """
    match = handle_regexp.match(val)
    return match
def is_ean8(val):
    """Test if argument is an International Article Number (EAN-8).

    The first seven digits are weighted alternately by 3 and 1 and the
    resulting check digit is compared with the eighth.

    :param val: Candidate identifier (no separators are stripped).
    :returns: ``True`` if the check digit matches, ``False`` otherwise.
    """
    if len(val) != 8:
        return False
    try:
        total = 0
        for index, char in enumerate(val[:-1]):
            # Even positions weigh 3, odd positions weigh 1.
            total += int(char) * (1 if index % 2 else 3)
        return (10 - total % 10) % 10 == int(val[-1])
    except ValueError:
        return False
def is_ean13(val):
    """Test if argument is an International Article Number (EAN-13).

    The first twelve digits are weighted alternately by 1 and 3 and the
    resulting check digit is compared with the thirteenth.

    :param val: Candidate identifier (no separators are stripped).
    :returns: ``True`` if the check digit matches, ``False`` otherwise.
    """
    if len(val) != 13:
        return False
    try:
        total = 0
        for index, char in enumerate(val[:-1]):
            # Even positions weigh 1, odd positions weigh 3.
            total += int(char) * (3 if index % 2 else 1)
        return (10 - total % 10) % 10 == int(val[-1])
    except ValueError:
        return False
def is_ean(val):
    """Test if argument is an International Article Number (EAN-13 or EAN-8).

    :param val: Candidate identifier string.
    :returns: ``True`` if either ``is_ean13`` or ``is_ean8`` accepts ``val``.
    """
    # EAN-13 is tried first; evaluation stops at the first successful check.
    return any(check(val) for check in (is_ean13, is_ean8))
def is_isni(val):
    """Test if argument is an International Standard Name Identifier.

    Hyphens and spaces are ignored and the value is upper-cased. The check
    character (last position, ``X`` meaning 10) is verified against the
    value computed from the first fifteen digits.

    :param val: Candidate identifier string.
    :returns: ``True`` if the check character matches, ``False`` otherwise.
    """
    isni = val.replace("-", "").replace(" ", "").upper()
    if len(isni) != 16:
        return False
    body, check_char = isni[:-1], isni[-1]
    try:
        acc = 0
        for digit in body:
            acc = (acc + int(digit)) * 2
        expected = (12 - acc % 11) % 11
        # 'X' encodes a check value of 10.
        actual = 10 if check_char == 'X' else int(check_char)
    except ValueError:
        return False
    return expected == actual
def is_orcid(val):
    """Test if argument is an ORCID ID.

    A leading ``orcid_url`` prefix, hyphens and spaces are removed. The
    remainder must pass ``is_isni`` and its first fifteen digits, read as an
    integer, must lie between 15000000 and 35000000 inclusive.

    :param val: Candidate identifier string.
    :returns: ``True`` or ``False``.
    """
    if val.startswith(orcid_url):
        val = val[len(orcid_url):]
    compact = val.replace("-", "").replace(" ", "")
    if not is_isni(compact):
        return False
    # Drop the check character and compare the numeric part to the range.
    number = int(compact[:-1], 10)
    return 15000000 <= number <= 35000000
def is_ark(val):
    """Test if argument is an ARK.

    Accepts either a bare ``ark:/...`` value or an ``http`` URL with a
    non-empty host, no URL params, and a path that is an ARK suffix.

    :param val: Candidate identifier string.
    :returns: A truthy value (match object or ``True``) for an ARK,
        otherwise a falsy value (``False`` or ``None``).
    """
    res = urlparse(val)
    bare = ark_suffix_regexp.match(val)
    if bare:
        return bare
    return (
        res.scheme == 'http'
        and res.netloc != ''
        # res.path includes the leading slash, hence [1:] so the same
        # regular expression can be reused.
        and ark_suffix_regexp.match(res.path[1:])
        and res.params == ''
    )
def is_purl(val):
    """Test if argument is a PURL.

    :param val: Candidate identifier string.
    :returns: ``True`` for an ``http`` URL whose host is in the accepted
        list and whose path is non-empty, ``False`` otherwise.
    """
    # NOTE(review): every accepted host below is an empty string in this
    # copy of the file; presumably the PURL resolver hostnames were lost --
    # confirm against the upstream source.
    accepted_hosts = ['', '', '', '']
    parts = urlparse(val)
    if parts.scheme != 'http':
        return False
    if parts.netloc not in accepted_hosts:
        return False
    return parts.path != ''
def is_url(val):
    """Test if argument is a URL.

    :param val: Candidate identifier string.
    :returns: ``True`` when ``val`` parses with a scheme, a network location
        and no URL params; ``False`` otherwise.
    """
    parts = urlparse(val)
    has_scheme_and_host = parts.scheme and parts.netloc
    return bool(has_scheme_and_host and parts.params == '')
def is_lsid(val):
    """Test if argument is a LSID.

    :param val: Candidate identifier string.
    :returns: ``False`` when ``val`` is not a URN; otherwise the result of
        matching ``lsid_regexp`` (a match object or ``None``).
    """
    if not is_urn(val):
        return False
    return lsid_regexp.match(val)
def is_urn(val):
    """Test if argument is an URN.

    :param val: Candidate identifier string.
    :returns: ``True`` when ``val`` parses with scheme ``urn``, an empty
        network location and a non-empty path; ``False`` otherwise.
    """
    parts = urlparse(val)
    if parts.scheme != 'urn':
        return False
    return bool(parts.netloc == '' and parts.path != '')
def is_ads(val):
    """Test if argument is an ADS bibliographic code.

    :param val: Candidate identifier string.
    :returns: The match object from ``ads_regexp`` (truthy), or ``None``.
    """
    match = ads_regexp.match(val)
    return match
def is_arxiv_post_2007(val):
    """Test if argument is a post-2007 arXiv ID.

    The plain post-2007 pattern is tried first, then the variant that
    carries an old-style class specification.

    :param val: Candidate identifier string.
    :returns: The first successful match object, or ``None``.
    """
    plain = arxiv_post_2007_regexp.match(val)
    if plain:
        return plain
    return arxiv_post_2007_with_class_regexp.match(val)
def is_arxiv_pre_2007(val):
    """Test if argument is a pre-2007 arXiv ID.

    :param val: Candidate identifier string.
    :returns: The match object from ``arxiv_pre_2007_regexp`` (truthy), or
        ``None``.
    """
    match = arxiv_pre_2007_regexp.match(val)
    return match
def is_arxiv(val):
    """Test if argument is an arXiv ID.

    :param val: Candidate identifier string.
    :returns: The result of ``is_arxiv_post_2007`` when truthy, otherwise
        the result of ``is_arxiv_pre_2007``.
    """
    post_2007 = is_arxiv_post_2007(val)
    return post_2007 if post_2007 else is_arxiv_pre_2007(val)
def is_pmid(val):
    """Test if argument is a PubMed ID.

    Warning: PMIDs are just integers with no structure, so this function
    reports any integer as a PubMed ID.

    :param val: Candidate identifier string.
    :returns: The match object from ``pmid_regexp`` (truthy), or ``None``.
    """
    match = pmid_regexp.match(val)
    return match
def is_pmcid(val):
    """Test if argument is a PubMed Central ID.

    :param val: Candidate identifier string.
    :returns: The match object from ``pmcid_regexp`` (truthy), or ``None``.
    """
    match = pmcid_regexp.match(val)
    return match
def is_gnd(val):
    """Test if argument is a GND Identifier.

    A leading ``gnd_resolver_url`` prefix is removed before matching.

    :param val: Candidate identifier string.
    :returns: The match object from ``gnd_regexp`` (truthy), or ``None``.
    """
    if val.startswith(gnd_resolver_url):
        candidate = val[len(gnd_resolver_url):]
    else:
        candidate = val
    return gnd_regexp.match(candidate)
def is_sra(val):
    """Test if argument is an SRA accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``sra_regexp`` (truthy), or ``None``.
    """
    match = sra_regexp.match(val)
    return match
def is_bioproject(val):
    """Test if argument is a BioProject accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``bioproject_regexp`` (truthy), or
        ``None``.
    """
    match = bioproject_regexp.match(val)
    return match
def is_biosample(val):
    """Test if argument is a BioSample accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``biosample_regexp`` (truthy), or
        ``None``.
    """
    match = biosample_regexp.match(val)
    return match
def is_ensembl(val):
    """Test if argument is an Ensembl accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``ensembl_regexp`` (truthy), or ``None``.
    """
    match = ensembl_regexp.match(val)
    return match
def is_uniprot(val):
    """Test if argument is a UniProt accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``uniprot_regexp`` (truthy), or ``None``.
    """
    match = uniprot_regexp.match(val)
    return match
def is_refseq(val):
    """Test if argument is a RefSeq accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``refseq_regexp`` (truthy), or ``None``.
    """
    match = refseq_regexp.match(val)
    return match
def is_genome(val):
    """Test if argument is a GenBank or RefSeq genome assembly accession.

    :param val: Candidate identifier string.
    :returns: The match object from ``genome_regexp`` (truthy), or ``None``.
    """
    match = genome_regexp.match(val)
    return match
PID_SCHEMES = [
    ('doi', is_doi),
    ('ark', is_ark),
    ('handle', is_handle),
    ('purl', is_purl),
    ('lsid', is_lsid),
    ('urn', is_urn),
    ('ads', is_ads),
    ('arxiv', is_arxiv),
    ('pmcid', is_pmcid),
    ('isbn', is_isbn),
    ('issn', is_issn),
    ('orcid', is_orcid),
    ('isni', is_isni),
    ('ean13', is_ean13),
    ('ean8', is_ean8),
    ('istc', is_istc),
    ('gnd', is_gnd),
    ('url', is_url),
    ('pmid', is_pmid),
    ('sra', is_sra),
    ('bioproject', is_bioproject),
    ('biosample', is_biosample),
    ('ensembl', is_ensembl),
    ('uniprot', is_uniprot),
    ('refseq', is_refseq),
    ('genome', is_genome),
]
"""Definition of scheme name and associated test function.

Order of list is important, as identifier scheme detection will test in the
order given by this list."""

# Pairs of (scheme, schemes_to_drop): each entry names a scheme together with
# the list of more generic schemes that are listed against it.
SCHEME_FILTER = [
    ('ean8', ['gnd', 'pmid']),
    ('ean13', ['gnd', 'pmid']),
    ('isbn', ['gnd', 'pmid']),
    ('orcid', ['gnd', 'pmid']),
    ('isni', ['gnd', 'pmid']),
    ('issn', ['gnd', ]),
]
def detect_identifier_schemes(val):
    """Detect persistent identifier scheme for a given value.

    .. note:: Some schemes like PMID are very generic.

    :param val: Candidate identifier string.
    :returns: List of scheme names from ``PID_SCHEMES`` whose test accepts
        ``val``, in ``PID_SCHEMES`` order, after the ``SCHEME_FILTER`` and
        handle-specific exclusions have been applied.
    """
    schemes = [name for name, matches in PID_SCHEMES if matches(val)]

    # When a specific scheme is present, drop the generic ones listed for it.
    for trigger, suppressed in SCHEME_FILTER:
        if trigger in schemes:
            schemes = [name for name in schemes if name not in suppressed]

    if 'handle' in schemes:
        # NOTE(review): both prefixes below are empty strings in this copy,
        # so this condition can never be true for a string; presumably the
        # Handle resolver URL prefixes were lost -- confirm upstream.
        plain_url = (
            'url' in schemes
            and not val.startswith("")
            and not val.startswith("")
        )
        if plain_url or 'ark' in schemes or 'arxiv' in schemes:
            schemes = [name for name in schemes if name != 'handle']
    return schemes
def normalize_doi(val):
    """Normalize a DOI.

    :param val: DOI string accepted by ``doi_regexp``.
    :returns: The second capture group of the ``doi_regexp`` match.
    :raises AttributeError: If ``val`` does not match ``doi_regexp``.
    """
    m = doi_regexp.match(val)
    # NOTE(review): the returned expression was missing from this line (a
    # bare ``return`` made the function always yield None). Group 2 follows
    # the sibling normalizers, whose patterns put an optional prefix in
    # group 1 and the identifier in group 2 -- confirm against doi_regexp.
    return m.group(2)
def normalize_handle(val):
    """Normalize a Handle identifier.

    :param val: Handle string accepted by ``handle_regexp``.
    :returns: The second capture group of the ``handle_regexp`` match.
    :raises AttributeError: If ``val`` does not match ``handle_regexp``.
    """
    m = handle_regexp.match(val)
    # NOTE(review): the returned expression was missing from this line (a
    # bare ``return`` made the function always yield None). Group 2 follows
    # the sibling normalizers, whose patterns put an optional prefix in
    # group 1 and the identifier in group 2 -- confirm against handle_regexp.
    return m.group(2)
def normalize_ads(val):
    """Normalize an ADS bibliographic code.

    :param val: ADS bibcode, optionally prefixed with ``ads:`` or ``ADS:``.
    :returns: The bibcode without the prefix.
    :raises AttributeError: If ``val`` does not match ``ads_regexp``.
    """
    m = ads_regexp.match(val)
    # Group 1 of ads_regexp is the optional "ads:" prefix; group 2 is the
    # bibcode itself. The bare ``return`` previously here yielded None.
    return m.group(2)
def normalize_orcid(val):
    """Normalize an ORCID identifier.

    A leading ``orcid_url`` prefix, hyphens and spaces are removed, then the
    first sixteen characters are regrouped into four hyphen-separated blocks.

    :param val: ORCID identifier string.
    :returns: Identifier in ``XXXX-XXXX-XXXX-XXXX`` layout.
    """
    if val.startswith(orcid_url):
        val = val[len(orcid_url):]
    compact = val.replace("-", "").replace(" ", "")
    return "-".join(compact[start:start + 4] for start in range(0, 16, 4))
def normalize_gnd(val):
    """Normalize a GND identifier.

    A leading ``gnd_resolver_url`` and a case-insensitive ``gnd:`` prefix are
    removed, then a lowercase ``gnd:`` prefix is added back.

    :param val: GND identifier string.
    :returns: Identifier in ``gnd:<value>`` form.
    """
    prefix = "gnd:"
    if val.startswith(gnd_resolver_url):
        val = val[len(gnd_resolver_url):]
    if val.lower().startswith(prefix):
        val = val[len(prefix):]
    return "gnd:{0}".format(val)
def normalize_pmid(val):
    """Normalize a PubMed ID.

    :param val: PubMed ID, optionally prefixed with ``pmid:`` (any case).
    :returns: The numeric part as a string.
    :raises AttributeError: If ``val`` does not match ``pmid_regexp``.
    """
    m = pmid_regexp.match(val)
    # Group 1 of pmid_regexp is the optional "pmid:" prefix; group 2 is the
    # digits. The bare ``return`` previously here yielded None.
    return m.group(2)
def normalize_arxiv(val):
    """Normalize an arXiv identifier.

    Ensures an ``arXiv:`` prefix with canonical capitalisation, then
    rewrites the identifier according to whichever arXiv pattern matches.

    :param val: arXiv identifier string.
    :returns: Normalized identifier.
    """
    if not val.lower().startswith("arxiv:"):
        val = "arXiv:{0}".format(val)
    elif val[:6] != "arXiv:":
        # Prefix present but with different capitalisation.
        val = "arXiv:{0}".format(val[6:])

    # NOTE(review): the ``m.group(...)`` expressions in this function were
    # stripped from the source line; the indices below are reconstructed
    # from the surviving fragments (", 2, 4, 5" and ", 3") and the example
    # in the next comment -- confirm against the arXiv regular expressions.

    # Normalize old identifiers to the preferred scheme
    # (i.e. arXiv:math.GT/0309136 -> arXiv:math/0309136)
    m = is_arxiv_pre_2007(val)
    if m and m.group(3):
        val = "".join(m.group(1, 2, 4, 5))
        if m.group(6):
            val += m.group(6)

    m = is_arxiv_post_2007(val)
    if m:
        val = 'arXiv:' + '.'.join(m.group(2, 3))
        if m.group(4):
            val += m.group(4)

    return val
def normalize_isbn(val):
    """Normalize an ISBN identifier.

    Spaces and hyphens are removed and the value is upper-cased before it is
    handed to ``ISBN(...).hyphen()``.

    :param val: ISBN string.
    :returns: Whatever ``ISBN(val).hyphen()`` returns for the cleaned value.
    """
    compact = val.replace(' ', '').replace('-', '').strip().upper()
    isbn = ISBN(compact)
    return isbn.hyphen()


def normalize_issn(val):
    """Normalize an ISSN identifier.

    Spaces and hyphens are removed, the value is upper-cased, and a hyphen
    is inserted after the fourth character.

    :param val: ISSN string.
    :returns: Identifier in ``XXXX-XXXX`` layout.
    """
    compact = val.replace(' ', '').replace('-', '').strip().upper()
    head, tail = compact[:4], compact[4:]
    return "-".join((head, tail))
def normalize_pid(val, scheme):
    """Normalize an identifier.

    E.g. ``doi:10.1234/foo`` and ``10.1234/foo`` will both be normalized to
    ``10.1234/foo``.

    :param val: The identifier's value; falsy values are returned as-is.
    :param scheme: The identifier's scheme.
    :returns: The normalized value, or ``val`` unchanged when no normalizer
        is registered for ``scheme``.
    """
    if not val:
        return val

    normalizers = (
        ('doi', normalize_doi),
        ('handle', normalize_handle),
        ('ads', normalize_ads),
        ('pmid', normalize_pmid),
        ('arxiv', normalize_arxiv),
        ('orcid', normalize_orcid),
        ('gnd', normalize_gnd),
        ('isbn', normalize_isbn),
        ('issn', normalize_issn),
    )
    for name, normalizer in normalizers:
        if scheme == name:
            return normalizer(val)
    return val
# NOTE(review): every template below is the same '{scheme}://{pid}' string in
# this copy of the file; presumably the per-provider host and path parts were
# lost -- confirm against the upstream source before relying on these URLs.
LANDING_URLS = {
    'doi': u'{scheme}://{pid}',
    'handle': u'{scheme}://{pid}',
    'arxiv': u'{scheme}://{pid}',
    'orcid': u'{scheme}://{pid}',
    'pmid': u'{scheme}://{pid}',
    'ads': u'{scheme}://{pid}',
    'pmcid': u'{scheme}://{pid}',
    'gnd': u'{scheme}://{pid}',
    'urn': u'{scheme}://{pid}',
    'sra': u'{scheme}://{pid}',
    'bioproject': u'{scheme}://{pid}',
    'biosample': u'{scheme}://{pid}',
    'ensembl': u'{scheme}://{pid}',
    'uniprot': u'{scheme}://{pid}',
    'refseq': u'{scheme}://{pid}',
    'genome': u'{scheme}://{pid}',
}
"""URL generation configuration for the supported PID providers."""
def to_url(val, scheme, url_scheme='http'):
    """Convert a resolvable identifier into a URL for a landing page.

    :param val: The identifier's value.
    :param scheme: The identifier's scheme.
    :param url_scheme: Scheme to use for URL generation, 'http' or 'https'.
    :returns: URL for the identifier, or an empty string when no URL can be
        generated for the scheme.

    .. versionadded:: 0.3.0
        ``url_scheme`` used for URL generation.
    """
    pid = normalize_pid(val, scheme)

    if scheme not in LANDING_URLS:
        # PURLs and plain URLs are already resolvable as they are.
        return pid if scheme in ['purl', 'url'] else ''

    if scheme == 'gnd' and pid.startswith('gnd:'):
        # Landing URLs take the bare GND number, without the "gnd:" prefix.
        pid = pid[len('gnd:'):]
    elif scheme == 'urn' and not pid.lower().startswith('urn:nbn:'):
        # Only urn:nbn: identifiers get a landing URL.
        return ''
    return LANDING_URLS[scheme].format(scheme=url_scheme, pid=pid)