Source code for grblc.search.gcn.scraper

import re
import os
import json
import glob2
import shutil
import tarfile
import requests
from .parser import (
    check_table,
    check_sentence,
    get_final_tables_txt,
    get_final_sentences_txt,
    get_final_txt,
    final_tables_to_csv,
    final_sentences_to_csv,
)

class Scraper:
    """Download the GCN circular archive and extract per-GRB text files.

    Two directories are managed by a scraper:

    * ``__data_path__``   -- where the circular archive is unpacked and where
      the GCN -> GRB index (``gcn_archive_circ_dict.json``) is written.
    * ``__output_path__`` -- where the per-GRB text files are written.

    Both always end in ``gcncc/data/`` and ``gcncc/output/`` respectively,
    placed under the base directory supplied by the caller (or relative to the
    current working directory when no base is given).
    """

    # Matches e.g. "GRB 090423", "GRB090423B", "grb 090423b".  The capture
    # group keeps only the six-digit date plus the optional suffix letter; the
    # negative lookahead rejects longer digit runs.
    _GRB_PATTERN = re.compile(r"GRB\s?(\d{6}[A-Z]?)(?!\d)", flags=re.IGNORECASE)

    # Line written in front of every circular copied into an output file.
    _SEPARATOR = "=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+="

    # TODO: We will have to change output_path to a non-default argument at the end.
    # (because I believe the client will not want the output to be in the module.)
    def __init__(self, data_path="", output_path=""):
        """Create a scraper.

        Parameters
        ----------
        data_path : str
            Base directory for downloaded data; ``gcncc/data/`` is appended.
        output_path : str
            Base directory for results; ``gcncc/output/`` is appended.
        """
        self.set_data_path(data_path)
        self.set_output_path(output_path)

    @staticmethod
    def _relocate(old_path, new_path):
        """Best-effort move of an existing directory to its new location.

        Nothing happens when there is no previous path, when it does not exist
        on disk, or when it is identical to the new one.  A failing move is
        reported as a ``RuntimeWarning`` instead of being silently discarded,
        and never aborts the caller.
        """
        import warnings

        if old_path is None or old_path == new_path:
            return
        if not os.path.exists(old_path):
            return
        try:
            shutil.move(old_path, new_path)
        except OSError as err:  # shutil.Error is a subclass of OSError
            warnings.warn(
                f"Could not move {old_path!r} to {new_path!r}: {err}",
                RuntimeWarning,
            )

    def set_output_path(self, path=""):
        """Set the output directory to ``<path>/gcncc/output/``.

        If an output directory was configured before and exists on disk, it is
        moved to the new location.  An empty ``path`` selects the relative
        directory ``gcncc/output/``.
        """
        # os.path.join covers all three cases: empty base, base with a
        # trailing separator, and base without one.
        new_output_path = os.path.join(path or "", "gcncc/output/")
        # The attribute does not exist yet on the first call (from __init__).
        self._relocate(getattr(self, "__output_path__", None), new_output_path)
        self.__output_path__ = new_output_path

    def set_data_path(self, path=""):
        """Set the data directory to ``<path>/gcncc/data/``.

        If a data directory was configured before and exists on disk, it is
        moved to the new location.  An empty ``path`` selects the relative
        directory ``gcncc/data/``.
        """
        new_data_path = os.path.join(path or "", "gcncc/data/")
        # The attribute does not exist yet on the first call (from __init__).
        self._relocate(getattr(self, "__data_path__", None), new_data_path)
        self.__data_path__ = new_data_path

    # TODO: check the usage of save argument
    def grb_circulars(self, save=True):
        """Download the GCN circular archive and unpack it into the data path.

        The gzip-compressed tarball is streamed straight from the HTTP
        response into ``tarfile`` and extracted below ``__data_path__``.

        Parameters
        ----------
        save : bool
            Currently unused.

        Returns
        -------
        tarfile.TarFile
            The archive object that was extracted.

        Raises
        ------
        requests.HTTPError
            If the server answers with an error status.
        """
        url = "https://gcn.gsfc.nasa.gov/gcn3/all_gcn_circulars.tar.gz"
        # The context manager releases the connection once extraction is done;
        # the timeout keeps a stalled server from blocking forever.
        with requests.get(url, stream=True, timeout=60) as response:
            # Fail with a clear HTTP error instead of handing an error page
            # to tarfile.
            response.raise_for_status()
            # "r|gz" reads the archive as a forward-only stream, which is what
            # the non-seekable response.raw object requires.
            file = tarfile.open(fileobj=response.raw, mode="r|gz")
            # NOTE(review): extractall() trusts member paths inside the
            # archive.  Consider passing filter="data" on Python versions that
            # support extraction filters.
            file.extractall(path=self.__data_path__)
        return file

    def load_gcn(self):
        """Return the paths of all ``*.gcn3`` files in ``<data path>/gcn3/``."""
        return glob2.glob(self.__data_path__ + "gcn3/" + "*.gcn3")

    # TODO: double check it is finding ALL of them there was one test case where previous function
    # returned more searches than this function
    # TODO: Check the usage of threading and echo arguments.
    def scrape(self, threading=True, echo=None):
        """Download all circulars and index the GRB names each one mentions.

        Writes ``gcn_archive_circ_dict.json`` into the data path.  It maps each
        circular's file name to the list of GRB identifiers found in its text
        (six digits plus an optional suffix letter, in order of appearance,
        duplicates kept).

        Parameters
        ----------
        threading : bool
            Currently unused.
        echo :
            Currently unused.
        """
        self.grb_circulars()

        circ_dict = {}
        fileErr = []
        for fileName in self.load_gcn():
            try:
                # Undecodable bytes are dropped rather than aborting the run.
                with open(fileName, encoding="utf8", errors="ignore") as file:
                    file_search = file.read()
            except OSError:
                # Remember unreadable files so the summary below is truthful.
                fileErr.append(fileName)
                continue
            # Key by bare file name; grb_lookup() re-joins it with the
            # gcn3/ directory.
            circ_dict[os.path.basename(fileName)] = self._GRB_PATTERN.findall(file_search)

        # Show if there are any problems while scraping.
        print("There were " + str(len(fileErr)) + " error(s) in the following file(s):")
        print(fileErr)

        # Write the dictionary to the data location.
        with open(self.__data_path__ + "gcn_archive_circ_dict.json", "w") as f:
            json.dump(circ_dict, f)

    # Function that searches dictionary for GRB (Add keywords like 'subaru' later)
    def grb_lookup(self, grb, *keywords):
        """Collect every circular that mentions ``grb`` into text files.

        Inside ``<output path>/<grb>/`` this writes ``<grb>_all_gcn.txt`` (all
        matching circulars), ``<grb>_table.txt`` and ``<grb>_sentences.txt``
        (circulars accepted by ``check_table`` / ``check_sentence``) and, for
        each keyword that matches at least one circular,
        ``<grb>_<keyword>.txt`` containing all circulars matching it.  The
        ``get_final_*`` parser helpers are then run on the result.

        Parameters
        ----------
        grb : str
            GRB identifier; names shorter than six characters are left-padded
            with zeros.
        *keywords : str
            Regular expressions searched case-insensitively in each circular.

        Raises
        ------
        FileNotFoundError
            If the index written by ``scrape()`` is missing.
        """
        # Modify GRB if the length of it is smaller than 6.
        if len(grb) < 6:
            grb = grb.rjust(6, "0")

        # makedirs also creates missing parents such as "gcncc/".
        grb_dir = f"{self.__output_path__}{grb}/"
        os.makedirs(grb_dir, exist_ok=True)

        # Load the data.
        filepath = self.__data_path__ + "gcn_archive_circ_dict.json"
        try:
            with open(filepath) as file:
                circ_dict = json.load(file)
        except FileNotFoundError as err:
            raise FileNotFoundError(
                'The "data" folder could be removed. Try scrape().'
            ) from err

        # Get the list of gcn that has the grb.
        gcn_list = [key for key, list_of_grbs in circ_dict.items() if grb in list_of_grbs]

        # Types, files, and check functions kept in one table to reduce repetition.
        categories = [
            {"name": "all_gcn", "counter": 0, "file_name": f"{grb}_all_gcn.txt", "check": lambda x: True},
            {"name": "table", "counter": 0, "file_name": f"{grb}_table.txt", "check": check_table},
            {"name": "sentence", "counter": 0, "file_name": f"{grb}_sentences.txt", "check": check_sentence},
        ]
        # Keyword files are opened lazily, once per call, so that every
        # matching circular is kept instead of each one overwriting the last.
        keyword_files = {}

        try:
            for cat in categories:
                cat["file"] = open(f"{grb_dir}{cat['file_name']}", "w")

            for gcn in gcn_list:
                # errors="ignore" avoids a UnicodeDecodeError on odd bytes.
                with open(self.__data_path__ + "gcn3/" + gcn, "r", errors="ignore") as grb_open:
                    grb_listing = grb_open.read()
                entry = f"{self._SEPARATOR}\n\n{grb_listing}\n"

                # Use the check function of each category to filter GCNs.
                for cat in categories:
                    if cat["check"](grb_listing):
                        cat["file"].write(entry)
                        cat["counter"] += 1

                # Check if any keyword is in the text.
                for k in keywords:
                    if not re.search(k, grb_listing, flags=re.IGNORECASE):
                        continue
                    if k not in keyword_files:
                        keyword_files[k] = open(f"{grb_dir}{grb}_{k}.txt", "w")
                    keyword_files[k].write(entry)
                    # NOTE(review): keyword hits are counted on the "all_gcn"
                    # counter, as before; confirm this numbering is intended.
                    categories[0]["counter"] += 1
                    print(f"{categories[0]['counter']}. {gcn} for {k}\n")
        finally:
            # Close everything that was opened, even if a step above failed.
            for cat in categories:
                handle = cat.get("file")
                if handle is not None:
                    handle.close()
            for handle in keyword_files.values():
                handle.close()

        # Get the data object from get_final_*_txt functions.
        allData_tables = get_final_tables_txt(grb, self.__output_path__)
        allData_sentences = get_final_sentences_txt(grb, self.__output_path__)

        # Get the *_final.txt
        get_final_txt(grb, allData_tables, allData_sentences, self.__output_path__)

    # We will move these functions back to grb_lookup after they are finished.
    def final_tables_to_csv(self, grb):
        """Run the parser's ``final_tables_to_csv`` for ``grb`` on the output path."""
        final_tables_to_csv(grb, output_path=self.__output_path__)

    def final_sentences_to_csv(self, grb):
        """Run the parser's ``final_sentences_to_csv`` for ``grb`` on the output path."""
        final_sentences_to_csv(grb, output_path=self.__output_path__)