Source code for grblc.search.ads.search

import requests, re, os, sys
from .config import read_apikey
from .ECHO import SynchronizedEcho
import concurrent.futures, warnings
from ads import SearchQuery


def getArticles(finds, threading=True, debug=False):
    """
    User function to download the text bodies of a set of ADS articles.

    :param finds:
        Search result holding the GRB name under ``"GRB"`` and the articles to
        download under ``"articlelist"``.
    :type finds:
        :class:`dict`
    :param threading:
        Boolean to specify the use of concurrency.
    :type threading:
        :class:`bool`
    :param debug:
        Forwarded to :func:`getArticle` to enable verbose output.
    :type debug:
        :class:`bool`
    :returns:
        A "no articles" message string when ``finds["articlelist"]`` is empty.
        Otherwise, if the first article's bibcode contains "gcn", one string with
        every downloaded GCN separated by a divider line; else the list of entries
        collected by :func:`getArticle`.
    """
    papers = finds["articlelist"]
    GRB = finds["GRB"]
    if not papers:
        return r"No articles found! ¯\(°_o)/¯"

    # getArticle returns nothing; it appends each successful download to this list.
    articlelist = []
    if threading:
        threads = min(30, len(papers))

        def _fetch(article):
            getArticle(articlelist, article, GRB, debug=debug)

        # Leaving the ``with`` block waits for every submitted download to finish,
        # so no explicit shutdown() call is needed. The iterator returned by map()
        # is never consumed, hence exceptions raised inside workers are not
        # re-raised here.
        with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
            executor.map(_fetch, papers)
    else:
        # Call for the side effect only: collecting the (None) return values into
        # ``articlelist`` would throw away the texts that were just appended.
        for paper in papers:
            getArticle(articlelist, paper, GRB, debug=debug)

    if "gcn" in papers[0].bibcode.lower():
        result = "\n=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=\n\n".join(articlelist)
    else:
        result = articlelist

    # Count downloaded articles, not the characters of the joined GCN string.
    ECHO(f"[{GRB}] {len(articlelist)}/{len(papers)} saved.")
    return result


def prepareGRB(GRB):
    """
    Zero-pad a GRB name so the part before an optional trailing letter has at
    least six characters.

    :param GRB:
        GRB name; e.g., '10222A' or '200205A'. Must be non-empty.
    :type GRB:
        :class:`str`
    :returns:
        The padded name, e.g. '10222A' -> '010222A'. Names whose leading part is
        already six or more characters long come back unchanged.
    """
    # Only the very last character is treated as a possible letter suffix.
    has_suffix = GRB[-1].isalpha()
    stem = GRB[:-1] if has_suffix else GRB
    suffix = GRB[-1] if has_suffix else ""

    # rjust is a no-op when the stem already has six or more characters.
    return stem.rjust(6, "0") + suffix


def getGRBComboQuery(GRB):
    """
    Build the name alternatives of a GRB used in ADS searches: the bare name and
    the same name prefixed with "GRB".
    E.g., '010222A' gives '010222A OR GRB010222A'.

    :param GRB:
        The GRB to get name combinations of.
    :type GRB:
        :class:`str`
    :returns:
        String of GRB name combinations separated by "OR" for search in ADS.
    """
    return f"{GRB} OR GRB{GRB}"


def additionalKeywords(keywords):
    """
    Convert keyword(s) to a string to use in an ADS query.

    :param keywords:
        Keywords to specifically search for in addition to the GRB. A single
        keyword may be passed on its own; ``None`` means no keywords.
    :type keywords:
        :class:`list`,`tuple`,`str`
    :returns:
        ``"full:(k1 AND k2 ...)"`` built from the non-empty keywords, or an empty
        string when there are none.
    """
    if keywords is None:
        return ""

    # Treat a lone keyword as a one-element collection.
    if not isinstance(keywords, (list, tuple)):
        keywords = (keywords,)

    # Drop blank entries so the clause never degenerates into "full:()" or
    # "full:( AND x)".
    terms = [keyword for keyword in keywords if keyword]
    if not terms:
        return ""

    return "full:(" + " AND ".join(terms) + ")"


def gcnSearch(GRB, keywords=None, printlength=True, debug=False):
    """
    User function to find GCNs containing the inputted GRB and optional
    keywords

    :param GRB:
        GRB name; e.g., '010222' or '200205A'. Used in the query as given.
    :type GRB:
        :class:`str`
    :param keywords:
        Keywords to specifically search for in addition to the GRB. Passing any
        value other than ``None`` emits a warning.
    :type keywords:
        :class:`list`,`tuple`,`str`
    :param printlength:
        Determines whether the user would like the number of articles found to be printed.
    :type printlength:
        :class:`bool`
    :param debug:
        When true, the query string sent to ADS is printed.
    :type debug:
        :class:`bool`
    :returns:
        A list of `ads.search.Article`'s containing GCNs pertaining to GRB and optional
        keywords.
    """

    if keywords is not None:
        warnings.warn("Keywords aren't working correctly right now.", stacklevel=2)
    assert isinstance(GRB, str), "GRB is not of type string."

    query = f"bibstem:GCN {getGRBComboQuery(GRB)}"
    keyword_clause = additionalKeywords(keywords)
    # Separate the keyword clause from the GRB names with a space; gluing them
    # together would merge the last name and "full:(...)" into a single token.
    fullquery = f"{query} {keyword_clause}" if keyword_clause else query

    finds = list(SearchQuery(q=fullquery, fl=["bibcode", "identifier"]))
    if debug:
        ECHO(f"[{GRB}] Query: {fullquery}")
    if printlength:
        ECHO(f"[{GRB}] {len(finds)} candidates.")
    return finds


def litSearch(GRB, keywords=None, printlength=True, debug=False):
    """
    User function to find non-GCN literature mentioning the inputted GRB and
    optional keywords.

    :param GRB:
        GRB name; e.g., '010222' or '200205A'. It is zero-padded with
        :func:`prepareGRB` before searching.
    :type GRB:
        :class:`str`
    :param keywords:
        Keywords to specifically search for in addition to the GRB.
    :type keywords:
        :class:`list`,`tuple`,`str`
    :param printlength:
        Determines whether the user would like the number of articles found to be
        printed (only printed when at least one article was found).
    :type printlength:
        :class:`bool`
    :param debug:
        When true, also prints the query and the bibcodes that were found.
    :type debug:
        :class:`bool`
    :returns:
        A :class:`dict` with the padded GRB name under ``"GRB"`` and the list of
        `ads.search.Article`'s under ``"articlelist"``.
    """
    assert isinstance(GRB, str), "GRB is not of type string."

    GRB = prepareGRB(GRB)
    names = getGRBComboQuery(GRB)
    extra = additionalKeywords(keywords)
    fullquery = f"title:{names} OR abstract:{names} OR keyword:{names} {extra} -bibstem:GCN"

    # At most 100 rows are requested from ADS.
    search = SearchQuery(
        q=fullquery,
        fl=["bibcode", "identifier", "title", "author", "year"],
        rows=100,
    )
    finds = list(search)

    if finds and (printlength or debug):
        ECHO(f"[{GRB}] {len(finds)} found.")
    if debug:
        ECHO(f"[{GRB}] Query: '{fullquery}'")
        bibcodes = ", ".join(find.bibcode for find in finds)
        ECHO(f"Finds: {bibcodes}")

    return {"GRB": GRB, "articlelist": finds}


def _isDirectPdfLink(linktype, link):
    """Tell whether a resolver entry is a PDF link that can be fetched directly."""
    return "PDF" in linktype and "doi.org" not in link and "$" not in link


def getArticle(articlelist, article, GRB, firsttry=True, debug=False):
    """
    Download an article from arXiv or other sources.
    :param articlelist:
        The list to append the downloaded article to.
    :type articlelist:
        :class:`list`
    :param article:
        The ADS article to retrieve.
    :type article:
        :class:`ads.search.Article`
    :param GRB:
        GRB name, used only to label the printed messages.
    :type GRB:
        :class:`str`
    :param firsttry:
        When true, a download from an arXiv URL that comes back as HTML or empty
        is attempted one more time.
    :type firsttry:
        :class:`bool`
    :param debug:
        When true, prints progress and raises on HTTP error statuses instead of
        silently returning.
    :type debug:
        :class:`bool`
    :returns:
        Nothing. Side effect of appending to articlelist: the text for a GCN, or
        ``[content, title, year, url]`` for any other article.
    Modified from https://github.com/andycasey/ads/blob/master/examples/monthly-institute-publications/stromlo.py#22
    """

    if debug:
        ECHO(f"[{GRB}] Retrieving {article.bibcode}")
    isGCN = "GCN" in article.bibcode

    if isGCN:
        # Ask ADS to redirect us to the journal article.
        params = {"bibcode": article.bibcode, "link_type": "EJOURNAL"}
        url = requests.get("http://adsabs.harvard.edu/cgi-bin/nph-data_query", params=params).url
        q = requests.get(url)
    else:
        header = {"Authorization": f"Bearer {read_apikey()}"}
        url = f"https://api.adsabs.harvard.edu/v1/resolver/{article.bibcode}/esource"
        q = requests.get(
            url,
            headers=header,
            allow_redirects=False,
        )
        if not q.ok:
            if debug:
                ECHO(f"[{GRB}] Pass 1: Error retrieving {article.bibcode} ({q.status_code}): https://ui.adsabs.harvard.edu/abs/{article.bibcode}/abstract.")
                q.raise_for_status()
            return

        deserialized = q.json()
        pdf_header = {"user-agent": f"adsgrb/{__version__}"}

        # The reply either lists several candidates under links/records or
        # carries a single top-level link / link_type pair.
        try:
            records = deserialized["links"]["records"]
        except (KeyError, TypeError):
            records = None

        if records is not None:
            link = None
            for record in records:
                link = record["url"]
                if _isDirectPdfLink(record["link_type"], link):
                    break
            else:
                # No record qualified (this also covers an empty record list).
                ECHO(f"[{GRB}] Could not find suitable link for {article.bibcode}. {link}")
                return
        else:
            link = deserialized["link"]
            if not _isDirectPdfLink(deserialized["link_type"], link):
                ECHO(f"[{GRB}] Pass 2: No suitable link for {article.bibcode}. {link}")
                return

        # switch any arxiv url to export.arxiv.org so we don't get locked out
        # (links that already point at the export mirror are left alone)
        if "export.arxiv.org" not in link:
            link = link.replace("arxiv.org", "export.arxiv.org")
        url = link
        q = requests.get(url, stream=True, headers=pdf_header)

    if not q.ok:
        ECHO(f"[{GRB}] Pass 2: Error retrieving {article.bibcode} ({q.status_code}): {url}")
        if debug:
            q.raise_for_status()
        return

    # Check if the journal has given back forbidden HTML (or nothing at all).
    # The body is bytes, so the marker has to be a bytes literal as well.
    body = q.content
    if not body or b"</html>" in body.lower():
        ECHO(f"[{GRB}] Pass 2: Error retrieving {article.bibcode} (200): {url}")
        if firsttry and "arxiv" in url:
            ECHO(f"[{GRB}] Pass 2: Trying again for {article.bibcode}")
            getArticle(articlelist, article, GRB, firsttry=False, debug=debug)
        # Never store the rejected body; a successful retry has already appended.
        return

    if isGCN:
        articlelist.append(q.text)
    else:
        articlelist.append([body, article.title, article.year, url])


# Shared printer used by every function in this module for its status messages.
ECHO = SynchronizedEcho()

# Only the major version is consulted below; the other fields are unpacked
# alongside it.
major, minor1, minor2, release, serial = sys.version_info

# Keyword arguments handed to open() by readfile(): an explicit UTF-8 encoding
# is requested only when major >= 3.
if major >= 3:
    readfile_kwargs = {"encoding": "utf-8"}
else:
    readfile_kwargs = {}


def readfile(filename):
    """
    Read a whole file and return its contents.

    :param filename:
        Path of the file to read.
    :returns:
        Everything read from the file, opened with the module-level
        ``readfile_kwargs``.
    """
    with open(filename, **readfile_kwargs) as handle:
        return handle.read()


# Pull the package version out of the __init__.py that sits next to this module:
# the first `__version__ = "..."` assignment found in that file wins.
version_regex = re.compile('__version__ = "(.*?)"')
contents = readfile(
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "__init__.py",
    )
)
__version__ = version_regex.findall(contents)[0]