Source code for idutils.normalizers

# -*- coding: utf-8 -*-
#
# This file is part of IDUtils
# Copyright (C) 2024 CERN.
#
# IDUtils is free software; you can redistribute it and/or modify
# it under the terms of the Revised BSD License; see LICENSE file for
# more details.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.

"""ID normalizer helper functions."""

import unicodedata

import isbnlib

from .proxies import custom_schemes_registry
from .utils import *
from .validators import is_arxiv_post_2007, is_arxiv_pre_2007


[docs] def normalize_doi(val): """Normalize a DOI.""" m = doi_regexp.match(val) return m.group(2)
[docs] def normalize_handle(val): """Normalize a Handle identifier.""" m = handle_regexp.match(val) return m.group(2)
[docs] def normalize_ads(val): """Normalize an ADS bibliographic code.""" val = unicodedata.normalize("NFKD", val) m = ads_regexp.match(val) return m.group(2)
[docs] def normalize_orcid(val): """Normalize an ORCID identifier.""" for orcid_url in orcid_urls: if val.startswith(orcid_url): val = val[len(orcid_url) :] break val = val.replace("-", "").replace(" ", "") return "-".join([val[0:4], val[4:8], val[8:12], val[12:16]])
[docs] def normalize_gnd(val): """Normalize a GND identifier.""" m = gnd_regexp.match(val) return f"gnd:{m.group(2)}"
def normalize_urn(val): """Normalize a URN.""" if val.startswith(urn_resolver_url): val = val[len(urn_resolver_url) :] if val.lower().startswith("urn:"): val = val[len("urn:") :] return "urn:{0}".format(val)
[docs] def normalize_pmid(val): """Normalize a PubMed ID.""" m = pmid_regexp.match(val) return m.group(2)
[docs] def normalize_arxiv(val): """Normalize an arXiv identifier.""" if not val.lower().startswith("arxiv:"): val = "arXiv:{0}".format(val) elif val[:6] != "arXiv:": val = "arXiv:{0}".format(val[6:]) # Normalize old identifiers to preferred scheme as specified by # http://arxiv.org/help/arxiv_identifier_for_services # (i.e. arXiv:math.GT/0309136 -> arXiv:math/0309136) m = is_arxiv_pre_2007(val) if m and m.group(3): val = "".join(m.group(1, 2, 4, 5)) if m.group(6): val += m.group(6) m = is_arxiv_post_2007(val) if m: val = "arXiv:" + ".".join(m.group(2, 3)) if m.group(4): val += m.group(4) return val
def normalize_hal(val): """Normalize a HAL identifier.""" val = val.replace(" ", "").lower().replace("hal:", "") return val def normalize_isbn(val): """Normalize an ISBN identifier. Also converts ISBN10 to ISBN13. """ if is_isbn10(val): val = isbnlib.to_isbn13(val) return isbnlib.mask(isbnlib.canonical(val)) def normalize_issn(val): """Normalize an ISSN identifier.""" val = val.replace(" ", "").replace("-", "").strip().upper() return "{0}-{1}".format(val[:4], val[4:]) def normalize_ror(val): """Normalize a ROR.""" m = ror_regexp.match(val) return m.group(1) def normalize_viaf(val): """Normalize a VIAF identifier.""" for viaf_url in viaf_urls: if val.startswith(viaf_url): val = val[len(viaf_url) :] break if val.lower().startswith("viaf:"): val = val[len("viaf:") :] return "viaf:{0}".format(val)
[docs] def normalize_pid(val, scheme): """Normalize an identifier. E.g. doi:10.1234/foo and http://dx.doi.org/10.1234/foo and 10.1234/foo will all be normalized to 10.1234/foo. """ if not val: return val if scheme == "doi": return normalize_doi(val) elif scheme == "handle": return normalize_handle(val) elif scheme == "ads": return normalize_ads(val) elif scheme == "pmid": return normalize_pmid(val) elif scheme == "arxiv": return normalize_arxiv(val) elif scheme == "orcid": return normalize_orcid(val) elif scheme == "gnd": return normalize_gnd(val) elif scheme == "isbn": return normalize_isbn(val) elif scheme == "issn": return normalize_issn(val) elif scheme == "hal": return normalize_hal(val) elif scheme == "ror": return normalize_ror(val) elif scheme == "urn": return normalize_urn(val) elif scheme == "viaf": return normalize_viaf(val) else: for custom_scheme, normalizer in custom_schemes_registry().pick_scheme_key( "normalizer" ): if scheme == custom_scheme: return normalizer(val) return val
IDUTILS_LANDING_URLS = { "doi": "{scheme}://doi.org/{pid}", "handle": "{scheme}://hdl.handle.net/{pid}", "arxiv": "{scheme}://arxiv.org/abs/{pid}", "ascl": "{scheme}://ascl.net/{pid}", "orcid": "{scheme}://orcid.org/{pid}", "pmid": "{scheme}://pubmed.ncbi.nlm.nih.gov/{pid}/", "pmcid": "{scheme}://pmc.ncbi.nlm.nih.gov/articles/{pid}/", "ads": "{scheme}://ui.adsabs.harvard.edu/#abs/{pid}", "gnd": "{scheme}://d-nb.info/gnd/{pid}", "urn": "{scheme}://nbn-resolving.org/{pid}", "sra": "{scheme}://www.ebi.ac.uk/ena/data/view/{pid}", "bioproject": "{scheme}://www.ebi.ac.uk/ena/data/view/{pid}", "biosample": "{scheme}://www.ebi.ac.uk/ena/data/view/{pid}", "ensembl": "{scheme}://www.ensembl.org/id/{pid}", "uniprot": "{scheme}://purl.uniprot.org/uniprot/{pid}", "refseq": "{scheme}://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val={pid}", "genome": "{scheme}://www.ncbi.nlm.nih.gov/assembly/{pid}", "geo": "{scheme}://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc={pid}", "arrayexpress_array": "{scheme}://www.ebi.ac.uk/arrayexpress/arrays/{pid}", "arrayexpress_experiment": "{scheme}://www.ebi.ac.uk/arrayexpress/experiments/{pid}", "hal": "{scheme}://hal.archives-ouvertes.fr/{pid}", "swh": "{scheme}://archive.softwareheritage.org/{pid}", "ror": "{scheme}://ror.org/{pid}", "viaf": "{scheme}://viaf.org/viaf/{pid}", } """URL generation configuration for the supported PID providers."""
[docs] def to_url(val, scheme, url_scheme="http"): """Convert a resolvable identifier into a URL for a landing page. :param val: The identifier's value. :param scheme: The identifier's scheme. :param url_scheme: Scheme to use for URL generation, 'http' or 'https'. :returns: URL for the identifier. .. versionadded:: 0.3.0 ``url_scheme`` used for URL generation. """ pid = normalize_pid(val, scheme) landing_urls = IDUTILS_LANDING_URLS if scheme in landing_urls: if scheme == "gnd" and pid.startswith("gnd:"): pid = pid[len("gnd:") :] if scheme == "urn" and not pid.lower().startswith("urn:nbn:"): return "" if scheme == "ascl": pid = val.split(":")[1] if scheme == "viaf" and pid.startswith("viaf:"): pid = pid[len("viaf:") :] url_scheme = "https" return landing_urls[scheme].format(scheme=url_scheme, pid=pid) elif scheme in ["purl", "url"]: return pid else: for custom_scheme, url_generator in custom_schemes_registry().pick_scheme_key( "url_generator" ): if scheme == custom_scheme: return url_generator(url_scheme, pid) return ""