Source code for idutils.validators
# -*- coding: utf-8 -*-
#
# This file is part of IDUtils
# Copyright (C) 2024 CERN.
# Copyright (C) 2025 Will Riley.
#
# IDUtils is free software; you can redistribute it and/or modify
# it under the terms of the Revised BSD License; see LICENSE file for
# more details.
#
# In applying this license, CERN does not waive the privileges and immunities
# granted to it by virtue of its status as an Intergovernmental Organization
# or submit itself to any jurisdiction.
"""Utility file containing ID validators."""
import unicodedata
from urllib.parse import urlparse
from .utils import *
from .utils import _convert_x_to_10
[docs]
def is_isbn(val):
    """Test if argument is an ISBN-10 or ISBN-13 number.

    :param val: Candidate identifier.
    :returns: ``True`` when ``is_isbn10`` or ``is_isbn13`` accepts ``val``
        and, in addition, ``val`` either starts with ``978``/``979`` or is
        not a valid EAN-13; ``False`` otherwise.
    """
    if not (is_isbn10(val) or is_isbn13(val)):
        return False
    # A value that is also a valid EAN-13 only counts as an ISBN when it
    # carries the 978/979 prefix.
    has_isbn_prefix = val[0:3] in ["978", "979"]
    return has_isbn_prefix or not is_ean13(val)
[docs]
def is_issn(val):
    """Test if argument is an ISSN number.

    Hyphens and spaces are ignored and the check character is treated
    case-insensitively.

    :param val: Candidate identifier (string).
    :returns: ``True`` if, after cleanup, ``val`` has eight characters, the
        first seven are ASCII digits, the last is an ASCII digit or ``X``,
        and the weighted sum (weights 8 down to 1) is divisible by 11;
        ``False`` otherwise.
    """
    val = val.replace("-", "").replace(" ", "").upper()
    if len(val) != 8:
        return False
    body, check_char = val[:-1], val[-1]
    # Only ASCII digits are legal in the body. ``int()`` on its own would
    # also accept non-ASCII decimal digits, so check the alphabet explicitly.
    if not (body.isascii() and body.isdigit()):
        return False
    # "X" (value 10) is legal only as the check character.
    if check_char not in "0123456789X":
        return False
    total = sum((8 - i) * int(x) for i, x in enumerate(body))
    total += 10 if check_char == "X" else int(check_char)
    return total % 11 == 0
[docs]
def is_istc(val):
    """Test if argument is an International Standard Text Code.

    See http://www.istc-international.org/html/about_structure_syntax.aspx

    Hyphens and spaces are ignored and hexadecimal letters are treated
    case-insensitively.

    :param val: Candidate identifier (string).
    :returns: ``True`` if, after cleanup, ``val`` consists of sixteen ASCII
        hexadecimal characters and the last one equals the check character
        computed from the first fifteen; ``False`` otherwise.
    """
    val = val.replace("-", "").replace(" ", "").upper()
    if len(val) != 16:
        return False
    # ``int(x, 16)`` also accepts non-ASCII decimal digits, so restrict the
    # alphabet explicitly before doing any arithmetic.
    if any(ch not in "0123456789ABCDEF" for ch in val):
        return False
    weights = (11, 9, 3, 1)
    total = sum(int(ch, 16) * weights[i % 4] for i, ch in enumerate(val[:-1]))
    # The check character is the weighted sum modulo 16, as an uppercase
    # hexadecimal digit.
    return format(total % 16, "X") == val[-1]
[docs]
def is_doi(val):
    """Test if argument is a DOI.

    :param val: Candidate identifier.
    :returns: The result of ``doi_regexp.match(val)`` (for a compiled ``re``
        pattern: a match object, or ``None`` when ``val`` does not match).
    """
    found = doi_regexp.match(val)
    return found
[docs]
def is_handle(val):
    """Test if argument is a Handle.

    Note, DOIs are also handles, and handles are very generic so they will
    also match e.g. any URL you parse.

    :param val: Candidate identifier.
    :returns: The falsy result of ``handle_regexp.match(val)`` when the
        pattern does not match; otherwise ``True`` unless ``val`` is a
        Software Heritage identifier, in which case ``False``.
    """
    matched = handle_regexp.match(val)
    if not matched:
        return matched
    # SWH identifiers are excluded from the handle scheme.
    return not is_swh(val)
[docs]
def is_ean8(val):
    """Test if argument is an International Article Number (EAN-8).

    :param val: Candidate identifier (string, no separators).
    :returns: ``True`` if ``val`` is exactly eight ASCII digits and the last
        digit equals the check digit computed from the first seven
        (alternating weights 3 and 1); ``False`` otherwise.
    """
    # ``int()`` accepts non-ASCII decimal digits too, so require plain ASCII
    # digits up front; afterwards the conversions below cannot fail.
    if len(val) != 8 or not (val.isascii() and val.isdigit()):
        return False
    weights = (3, 1)
    total = sum(int(ch) * weights[i % 2] for i, ch in enumerate(val[:-1]))
    expected = (10 - total % 10) % 10
    return expected == int(val[-1])
[docs]
def is_ean13(val):
    """Test if argument is an International Article Number (EAN-13).

    :param val: Candidate identifier (string, no separators).
    :returns: ``True`` if ``val`` is exactly thirteen ASCII digits and the
        last digit equals the check digit computed from the first twelve
        (alternating weights 1 and 3); ``False`` otherwise.
    """
    # ``int()`` accepts non-ASCII decimal digits too, so require plain ASCII
    # digits up front; afterwards the conversions below cannot fail.
    if len(val) != 13 or not (val.isascii() and val.isdigit()):
        return False
    weights = (1, 3)
    total = sum(int(ch) * weights[i % 2] for i, ch in enumerate(val[:-1]))
    expected = (10 - total % 10) % 10
    return expected == int(val[-1])
[docs]
def is_ean(val):
    """Test if argument is an International Article Number (EAN-13 or EAN-8).

    See http://en.wikipedia.org/wiki/International_Article_Number_(EAN).

    :param val: Candidate identifier.
    :returns: The result of ``is_ean13(val)`` when it is truthy, otherwise
        the result of ``is_ean8(val)``.
    """
    thirteen = is_ean13(val)
    return thirteen if thirteen else is_ean8(val)
[docs]
def is_isni(val):
    """Test if argument is an International Standard Name Identifier.

    Hyphens and spaces are ignored and the check character is treated
    case-insensitively.

    :param val: Candidate identifier (string).
    :returns: ``True`` if, after cleanup, ``val`` has sixteen characters, the
        first fifteen are ASCII digits, the last is an ASCII digit or ``X``,
        and the last character matches the check value computed from the
        first fifteen; ``False`` otherwise.
    """
    val = val.replace("-", "").replace(" ", "").upper()
    if len(val) != 16:
        return False
    body, check_char = val[:-1], val[-1]
    # ``int()`` also accepts non-ASCII decimal digits, so restrict the
    # alphabet explicitly; "X" is legal only as the check character.
    if not (body.isascii() and body.isdigit()):
        return False
    if check_char not in "0123456789X":
        return False
    acc = 0
    for digit in body:
        acc = (acc + int(digit)) * 2
    expected = (12 - acc % 11) % 11
    # A check value of 10 is written as "X".
    actual = 10 if check_char == "X" else int(check_char)
    return expected == actual
[docs]
def is_orcid(val):
    """Test if argument is an ORCID ID.

    See http://support.orcid.org/knowledgebase/
        articles/116780-structure-of-the-orcid-identifier

    :param val: Candidate identifier, optionally prefixed with one of the
        entries of ``orcid_urls``.
    :returns: ``True`` if the value (prefix, hyphens and spaces removed)
        passes ``is_isni`` and its number without the check character lies
        inside one of ``orcid_isni_ranges``; ``False`` otherwise.
    """
    # Strip the first matching URL prefix, if any.
    prefix = next((url for url in orcid_urls if val.startswith(url)), None)
    if prefix is not None:
        val = val[len(prefix) :]
    compact = val.replace("-", "").replace(" ", "")
    if not is_isni(compact):
        return False
    # Drop the check character and compare numerically against the ranges.
    number = int(compact[:-1], 10)
    for low, high in orcid_isni_ranges:
        if low <= number <= high:
            return True
    return False
def is_ark(val):
    """Test if argument is an ARK.

    Accepts either a bare ARK (anything ``ark_suffix_regexp`` matches) or an
    ``http``/``https`` URL with a non-empty host whose path, without the
    leading slash, matches ``ark_suffix_regexp`` and which has no URL params.

    :param val: Candidate identifier (string).
    :returns: A truthy value (a regexp match result or ``True``) when ``val``
        is recognised as an ARK, a falsy value otherwise.
    """
    res = urlparse(val)
    return ark_suffix_regexp.match(val) or (
        # Both schemes are accepted, in line with ``is_purl``; previously
        # ``https`` resolver URLs were rejected.
        res.scheme in ("http", "https")
        and res.netloc != ""
        # Note res.path includes leading slash, hence [1:] to use same regexp
        and ark_suffix_regexp.match(res.path[1:])
        and res.params == ""
    )
[docs]
def is_purl(val):
    """Test if argument is a PURL.

    :param val: Candidate identifier (string).
    :returns: ``True`` if ``val`` parses as an ``http``/``https`` URL whose
        network location is one of the known PURL hosts and whose path is
        non-empty; ``False`` otherwise.
    """
    parts = urlparse(val)
    if parts.scheme not in ("http", "https"):
        return False
    known_hosts = (
        "purl.org",
        "purl.oclc.org",
        "purl.net",
        "purl.com",
        "purl.fdlp.gov",
    )
    if parts.netloc not in known_hosts:
        return False
    return parts.path != ""
[docs]
def is_url(val):
    """Test if argument is a URL.

    :param val: Candidate identifier (string).
    :returns: ``True`` if ``urlparse`` finds both a scheme and a network
        location in ``val``; ``False`` otherwise.
    """
    parts = urlparse(val)
    if not parts.scheme:
        return False
    return bool(parts.netloc)
[docs]
def is_lsid(val):
    """Test if argument is a LSID.

    :param val: Candidate identifier.
    :returns: The falsy result of ``is_urn(val)`` when ``val`` is not a URN,
        otherwise the result of ``lsid_regexp.match(val)``.
    """
    urn_check = is_urn(val)
    if not urn_check:
        return urn_check
    return lsid_regexp.match(val)
[docs]
def is_urn(val):
    """Test if argument is an URN.

    :param val: Candidate identifier (string).
    :returns: ``True`` if ``val`` parses with scheme ``urn``, an empty
        network location and a non-empty path; ``False`` otherwise.
    """
    parts = urlparse(val)
    if parts.scheme != "urn":
        return False
    if parts.netloc != "":
        return False
    return parts.path != ""
[docs]
def is_ads(val):
    """Test if argument is an ADS bibliographic code.

    :param val: Candidate identifier (string).
    :returns: The result of ``ads_regexp.match`` applied to the
        NFKD-normalized form of ``val``.
    """
    # Match on the compatibility-decomposed (NFKD) form of the input.
    normalized = unicodedata.normalize("NFKD", val)
    return ads_regexp.match(normalized)
[docs]
def is_arxiv_post_2007(val):
    """Test if argument is a post-2007 arXiv ID.

    :param val: Candidate identifier.
    :returns: The result of ``arxiv_post_2007_regexp.match(val)`` when it is
        truthy, otherwise the result of
        ``arxiv_post_2007_with_class_regexp.match(val)``.
    """
    plain = arxiv_post_2007_regexp.match(val)
    if plain:
        return plain
    return arxiv_post_2007_with_class_regexp.match(val)
[docs]
def is_arxiv_pre_2007(val):
    """Test if argument is a pre-2007 arXiv ID.

    :param val: Candidate identifier.
    :returns: The result of ``arxiv_pre_2007_regexp.match(val)``.
    """
    found = arxiv_pre_2007_regexp.match(val)
    return found
[docs]
def is_arxiv(val):
    """Test if argument is an arXiv ID.

    See http://arxiv.org/help/arxiv_identifier and
        http://arxiv.org/help/arxiv_identifier_for_services.

    :param val: Candidate identifier.
    :returns: The result of ``is_arxiv_post_2007(val)`` when it is truthy,
        otherwise the result of ``is_arxiv_pre_2007(val)``.
    """
    modern = is_arxiv_post_2007(val)
    return modern if modern else is_arxiv_pre_2007(val)
def is_hal(val):
    """Test if argument is a HAL identifier.

    See (https://hal.archives-ouvertes.fr)

    :param val: Candidate identifier.
    :returns: The result of ``hal_regexp.match(val)``.
    """
    found = hal_regexp.match(val)
    return found
[docs]
def is_pmid(val):
    """Test if argument is a PubMed ID.

    Warning: PMIDs are just integers, with no structure, so this function
    will say any integer is a PubMed ID.

    :param val: Candidate identifier.
    :returns: The result of ``pmid_regexp.match(val)``.
    """
    found = pmid_regexp.match(val)
    return found
[docs]
def is_pmcid(val):
    """Test if argument is a PubMed Central ID.

    :param val: Candidate identifier.
    :returns: The result of ``pmcid_regexp.match(val)``.
    """
    found = pmcid_regexp.match(val)
    return found
[docs]
def is_gnd(val):
    """Test if argument is a GND Identifier.

    :param val: Candidate identifier.
    :returns: The result of ``gnd_regexp.match(val)``.
    """
    found = gnd_regexp.match(val)
    return found
[docs]
def is_sra(val):
    """Test if argument is an SRA accession.

    :param val: Candidate identifier.
    :returns: The result of ``sra_regexp.match(val)``.
    """
    found = sra_regexp.match(val)
    return found
[docs]
def is_bioproject(val):
    """Test if argument is a BioProject accession.

    :param val: Candidate identifier.
    :returns: The result of ``bioproject_regexp.match(val)``.
    """
    found = bioproject_regexp.match(val)
    return found
[docs]
def is_biosample(val):
    """Test if argument is a BioSample accession.

    :param val: Candidate identifier.
    :returns: The result of ``biosample_regexp.match(val)``.
    """
    found = biosample_regexp.match(val)
    return found
[docs]
def is_ensembl(val):
    """Test if argument is an Ensembl accession.

    :param val: Candidate identifier.
    :returns: The result of ``ensembl_regexp.match(val)``.
    """
    found = ensembl_regexp.match(val)
    return found
[docs]
def is_uniprot(val):
    """Test if argument is a UniProt accession.

    :param val: Candidate identifier.
    :returns: The result of ``uniprot_regexp.match(val)``.
    """
    found = uniprot_regexp.match(val)
    return found
[docs]
def is_refseq(val):
    """Test if argument is a RefSeq accession.

    :param val: Candidate identifier.
    :returns: The result of ``refseq_regexp.match(val)``.
    """
    found = refseq_regexp.match(val)
    return found
[docs]
def is_genome(val):
    """Test if argument is a GenBank or RefSeq genome assembly accession.

    :param val: Candidate identifier.
    :returns: The result of ``genome_regexp.match(val)``.
    """
    found = genome_regexp.match(val)
    return found
[docs]
def is_geo(val):
    """Test if argument is a Gene Expression Omnibus (GEO) accession.

    :param val: Candidate identifier.
    :returns: The result of ``geo_regexp.match(val)``.
    """
    found = geo_regexp.match(val)
    return found
[docs]
def is_arrayexpress_array(val):
    """Test if argument is an ArrayExpress array accession.

    :param val: Candidate identifier.
    :returns: The result of ``arrayexpress_array_regexp.match(val)``.
    """
    found = arrayexpress_array_regexp.match(val)
    return found
[docs]
def is_arrayexpress_experiment(val):
    """Test if argument is an ArrayExpress experiment accession.

    :param val: Candidate identifier.
    :returns: The result of ``arrayexpress_experiment_regexp.match(val)``.
    """
    found = arrayexpress_experiment_regexp.match(val)
    return found
def is_ascl(val):
    """Test if argument is an ASCL accession.

    :param val: Candidate identifier.
    :returns: The result of ``ascl_regexp.match(val)``.
    """
    found = ascl_regexp.match(val)
    return found
def is_rfc3987_ipath_absolute(val):
    """Test if the argument is an <ipath-absolute> from RFC 3987.

    :param val: Candidate path string.
    :returns: ``True`` if the ``"ipath_absolute"`` pattern from
        ``rfc3987_reg_exps`` matches the whole of ``val``; ``False``
        otherwise.
    """
    pattern = rfc3987_reg_exps["ipath_absolute"]
    whole = pattern.fullmatch(val)
    return whole is not None
def is_rfc3987_iri(val):
    """Test if the argument is an <iri> from RFC 3987.

    :param val: Candidate IRI string.
    :returns: ``True`` if the ``"iri"`` pattern from ``rfc3987_reg_exps``
        matches the whole of ``val``; ``False`` otherwise.
    """
    pattern = rfc3987_reg_exps["iri"]
    whole = pattern.fullmatch(val)
    return whole is not None
def is_swh(val):
    """Test if argument is a Software Heritage identifier.

    https://docs.softwareheritage.org/devel/swh-model/persistent-identifiers.html#syntax

    :param val: Candidate identifier (string).
    :returns: ``True`` if ``swh_before_qualifiers_regexp`` matches ``val``
        and either no qualifiers are present or every ``;``-separated
        qualifier matches ``swh_qualifier_values_regexp`` with a valid
        origin/path value; ``False`` otherwise.
    """
    head = swh_before_qualifiers_regexp.match(val)
    if head is None:
        return False

    raw_qualifiers = head.group("qualifiers")
    if raw_qualifiers is None:
        # Core identifier without any qualifiers.
        return True

    # Drop the leading semi-colon, then check each qualifier in turn.
    for item in str(raw_qualifiers)[1:].split(";"):
        parsed = swh_qualifier_values_regexp.match(item)
        if parsed is None:
            return False
        fields = parsed.groupdict()

        # origin value must be IRI according to RFC 3987
        origin = fields["origin_value"]
        if origin is not None and not is_rfc3987_iri(str(origin)):
            return False

        # path value must be an <ipath-absolute>
        path = fields["path_value"]
        if path is not None and not is_rfc3987_ipath_absolute(str(path)):
            return False

    return True
def is_ror(val):
    """Test if argument is a ROR id.

    :param val: Candidate identifier.
    :returns: The result of ``ror_regexp.match(val)``.
    """
    found = ror_regexp.match(val)
    return found
def is_viaf(val):
    """Test if argument is a VIAF id.

    :param val: Candidate identifier (string).
    :returns: ``True`` if ``val`` starts with one of ``viaf_urls``, or if
        ``viaf_regexp`` matches at the start of ``val`` and the matched text
        is the whole of ``val``; ``False`` otherwise.
    """
    if any(val.startswith(prefix) for prefix in viaf_urls):
        return True
    found = viaf_regexp.match(val)
    if not found:
        return False
    # The pattern must consume the entire input, not just a prefix.
    return found.group() == val
def is_email(val):
    """Test if argument looks like an email address.

    Note this test is designed to distinguish an email from other identifier
    schemes only. It does not imply a valid address / domain etc.

    :param val: Candidate identifier.
    :returns: The result of ``email_regexp.match(val)``.
    """
    found = email_regexp.match(val)
    return found
def is_sha1(val):
    """Test if argument is a valid SHA-1 (hex) hash.

    :param val: Candidate identifier.
    :returns: The result of ``sha1_regexp.match(val)``.
    """
    found = sha1_regexp.match(val)
    return found