"""Source code for grblc.search.gcn.scraper."""

import re
import os
import json
import glob2
import shutil
import tarfile
import requests
from .parser import (
    check_table,
    check_sentence,
    get_final_tables_txt,
    get_final_sentences_txt,
    get_final_txt,
    final_tables_to_csv,
    final_sentences_to_csv,
)

class Scraper:
    """
    Download the GCN circular archive, index the GRBs each circular mentions,
    and collect the circulars of a given GRB into text files.

    Two folders are used: the data path (downloaded circulars and the JSON
    index written by scrape()) and the output path (per-GRB text files written
    by grb_lookup()).
    """

    # Separator written in front of every circular copied into an output file.
    _SEPARATOR = "=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=\n\n"

    # "GRB", one optional whitespace character, six digits and an optional
    # suffix letter, not followed by another digit. Only the name is captured.
    _GRB_PATTERN = re.compile(r"GRB\s?(\d{6}[A-Z]?)(?!\d)", flags=re.IGNORECASE)

    # TODO: We will have to change output_path to a non-default argument at the end.
    #       (because I believe the client will not want the output to be in the module.)
    def __init__(self, data_path="", output_path=""):
        """
        Create a scraper.

        data_path and output_path are base folders; "gcncc/data/" and
        "gcncc/output/" are appended to them. An empty string makes the
        folders relative to the current working directory.
        """
        self.set_data_path(data_path)
        self.set_output_path(output_path)

    @staticmethod
    def _resolve_path(path, leaf):
        """Return leaf placed under path, or leaf alone when path is empty."""
        if not path:
            return leaf
        if path.endswith(os.path.sep):
            return path + leaf
        return path + os.path.sep + leaf

    def _relocate(self, attribute, new_path):
        """Move the folder stored in attribute (if any) to new_path and store new_path."""
        # The attribute does not exist yet on the first call from __init__.
        old_path = getattr(self, attribute, None)

        if old_path is not None and os.path.exists(old_path):
            try:
                shutil.move(old_path, new_path)
            except OSError as err:
                # Moving is best effort: the new path is still recorded, but
                # the failure is reported instead of being silently dropped.
                print(f"Could not move {old_path} to {new_path}: {err}")

        setattr(self, attribute, new_path)

    def set_output_path(self, path=""):
        """
        Set output_path for this scraper object.

        The output folder is "gcncc/output/" under path (or relative to the
        working directory when path is empty). An existing output folder at
        the previous location is moved to the new one.
        """
        self._relocate("__output_path__", self._resolve_path(path, "gcncc/output/"))

    def set_data_path(self, path=""):
        """
        Set data_path for this scraper object.

        The data folder is "gcncc/data/" under path (or relative to the
        working directory when path is empty). An existing data folder at
        the previous location is moved to the new one.
        """
        self._relocate("__data_path__", self._resolve_path(path, "gcncc/data/"))

    # TODO: check the usage of save argument
    def grb_circulars(self, save=True):
        """
        Downloads the tar file from the GCN website and extracts it into the
        data path.

        The save argument is currently unused. Returns the tarfile object the
        archive was extracted from; its underlying HTTP stream is closed by
        the time this method returns. Raises requests.HTTPError when the
        server answers with an error status.
        """

        url = "https://gcn.gsfc.nasa.gov/gcn3/all_gcn_circulars.tar.gz"

        # Use the safe "data" extraction filter where this Python provides it,
        # because the archive comes from the network.
        extract_options = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}

        # stream=True leaves the body unread so tarfile can consume it
        # directly from the socket; the timeout prevents hanging forever.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail on an HTTP error instead of feeding an error page to tarfile.
            response.raise_for_status()

            # "r|gz" reads the gzip-compressed archive as a forward-only stream.
            file = tarfile.open(fileobj=response.raw, mode="r|gz")
            file.extractall(path=self.__data_path__, **extract_options)

        return file

    def load_gcn(self):
        """
        Creates a list of all gcn files in the gcn3 folder of the data path.
        """
        return glob2.glob(self.__data_path__ + "gcn3/" + "*.gcn3")

    @classmethod
    def _extract_grb_names(cls, text):
        """Return every GRB name (six digits plus optional letter) found in text, in order."""
        return cls._GRB_PATTERN.findall(text)

    # TODO: double check it is finding ALL of them there was one test case where previous function
    #       returned more searches than this function
    # TODO: Check the usage of threading and echo arguments.
    def scrape(self, threading=True, echo=None):
        """
        Download the circulars and build the index of GRBs mentioned in each.

        The index maps each circular's file name to the list of GRB names
        found in it and is written to "gcn_archive_circ_dict.json" in the data
        path. The threading and echo arguments are currently unused.
        """

        self.grb_circulars()
        circ_dict = {}
        fileErr = []

        for fileName in self.load_gcn():

            # Undecodable bytes are dropped so one bad circular cannot abort the run.
            try:
                with open(fileName, encoding="utf8", errors="ignore") as file:
                    file_search = file.read()
            except OSError:
                fileErr.append(fileName)
                continue

            # Key the index by bare file name; grb_lookup() rebuilds the path from it.
            circ_dict[os.path.basename(fileName)] = self._extract_grb_names(file_search)

        # Show if there are any problems while scraping.
        print("There were " + str(len(fileErr)) + " error(s) in the following file(s):")
        print(fileErr)

        # Write the dictionary to the data path.
        with open(self.__data_path__ + "gcn_archive_circ_dict.json", "w") as f:
            json.dump(circ_dict, f)

    # Function that searches dictionary for GRB (Add keywords like 'subaru' later)
    def grb_lookup(self, grb, *keywords):
        """
        Look up a specific grb in the circular index written by scrape().

        Every circular mentioning the GRB is copied into
        "<grb>_all_gcn.txt"; circulars accepted by check_table and
        check_sentence are also copied into "<grb>_table.txt" and
        "<grb>_sentences.txt". For each keyword (used as a case-insensitive
        regular expression) the matching circulars are collected in
        "<grb>_<keyword>.txt". Finally the get_final_* parser functions are
        run on the output folder.

        Raises FileNotFoundError when the index file does not exist.
        """

        # Left-pad short names with zeros up to six characters.
        if len(grb) < 6:
            grb = grb.rjust(6, "0")

        # Create the output folder of this GRB, including missing parents.
        grb_dir = f"{self.__output_path__}{grb}/"
        os.makedirs(grb_dir, exist_ok=True)

        # Load the data.
        filepath = self.__data_path__ + "gcn_archive_circ_dict.json"
        try:
            with open(filepath) as file:
                circ_dict = json.load(file)
        except FileNotFoundError as err:
            raise FileNotFoundError('The "data" folder could be removed. Try scrape().') from err

        # Get the list of gcn that has the grb.
        gcn_list = [key for key, list_of_grbs in circ_dict.items() if grb in list_of_grbs]

        # The categories: output file name and the check deciding membership.
        categories = [
            {"name": "all_gcn", "counter": 0, "file_name": f"{grb}_all_gcn.txt", "check": lambda x: True},
            {"name": "table", "counter": 0, "file_name": f"{grb}_table.txt", "check": check_table},
            {"name": "sentence", "counter": 0, "file_name": f"{grb}_sentences.txt", "check": check_sentence},
        ]

        # One file per keyword, opened on its first match so that every
        # matching circular is kept rather than only the last one.
        keyword_files = {}

        try:
            for cat in categories:
                cat["file"] = open(f"{grb_dir}{cat['file_name']}", "w")

            for gcn in gcn_list:

                # errors="ignore" avoids a UnicodeDecodeError on odd bytes.
                # NOTE(review): scrape() reads these files as utf8, this read
                # uses the platform default encoding - confirm which is intended.
                with open(self.__data_path__ + "gcn3/" + gcn, "r", errors="ignore") as grb_open:
                    grb_listing = grb_open.read()

                entry = f"{self._SEPARATOR}{grb_listing}\n"

                # Use the check function of each category to filter GCNs.
                for cat in categories:
                    if cat["check"](grb_listing):
                        cat["file"].write(entry)
                        cat["counter"] += 1

                # Check if any keyword is in the text.
                for k in keywords:
                    if not re.search(k, grb_listing, flags=re.IGNORECASE):
                        continue

                    if k not in keyword_files:
                        keyword_files[k] = open(f"{grb_dir}{grb}_{k}.txt", "w")
                    keyword_files[k].write(entry)

                    # The first category's counter doubles as the running
                    # number shown in the progress message.
                    categories[0]["counter"] += 1
                    print(f"{categories[0]['counter']}. {gcn} for {k}\n")
        finally:
            # Close everything that was opened, even after an error, so the
            # parser functions below read fully flushed files.
            for cat in categories:
                handle = cat.get("file")
                if handle is not None:
                    handle.close()
            for handle in keyword_files.values():
                handle.close()

        # Get the data object from get_final_*_txt functions
        allData_tables = get_final_tables_txt(grb, self.__output_path__)
        allData_sentences = get_final_sentences_txt(grb, self.__output_path__)

        # Get the *_final.txt
        get_final_txt(grb, allData_tables, allData_sentences, self.__output_path__)

    # We will move these functions back to grb_lookup after they are finished.
    def final_tables_to_csv(self, grb):
        """Run the parser's final_tables_to_csv for grb on this scraper's output path."""
        final_tables_to_csv(grb, output_path=self.__output_path__)

    def final_sentences_to_csv(self, grb):
        """Run the parser's final_sentences_to_csv for grb on this scraper's output path."""
        final_sentences_to_csv(grb, output_path=self.__output_path__)