SEC Log Files

The Securities and Exchange Commission publishes log files recording which SEC filings were accessed, and at which dates and times. The code below accesses the SEC website, downloads this traffic data, and compiles it into a PostgreSQL database. Let's go through the process step by step.
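
Throughout the snippets below, create_engine receives a placeholder (XXXXXX) that you should replace with your own PostgreSQL connection string. As a minimal sketch (the user, password, host, and database name here are placeholders):

    from sqlalchemy import create_engine

    # Standard SQLAlchemy URL format for PostgreSQL (psycopg2 driver):
    # postgresql+psycopg2://<user>:<password>@<host>:<port>/<database>
    db_psql = create_engine('postgresql+psycopg2://user:password@localhost:5432/edgar')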

Let's start by writing a function that collects the links to all the log files available on the SEC website:

    def get_links(self):
        # Collect the links to all log files listed on the SEC's website.
        import requests
        from selectolax.parser import HTMLParser

        # The SEC requires a descriptive User-Agent header; replace the
        # placeholder below with your own name and email address.
        headers = {'User-Agent': 'Your Name your.email@example.com'}

        link_list = []
        for year in range(self.beg_yr, self.end_yr + 1):
            resp = requests.get(f'https://www.sec.gov/files/edgar{year}.html', headers=headers)
            html = HTMLParser(resp.text)

            # Grab the href attribute of every anchor tag on the page:
            for node in html.css('a'):
                href = node.attributes.get('href')
                if href is not None:
                    link_list.append(href)
        print(f'Created a list of links for the period between {self.beg_yr} and {self.end_yr}.')
        return link_list
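
The returned list will typically include navigation links alongside the log files themselves. Since the download step below reads each link with pd.read_csv, one option is to keep only the links that point to CSV files; a small sketch under that assumption:

    # Keep only links that appear to be CSV log files
    # (assumes the log files carry a .csv extension):
    csv_links = [link for link in link_list if link.endswith('.csv')]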

Next, let's write a function that creates the necessary SQL table. Once all the pieces are in place, we will collect the functions into a class.

    def create_db(self):
        # This function creates the tables necessary to hold the parsed data.
        from sqlalchemy import create_engine
        from sqlalchemy import text

        db_psql = create_engine(XXXXXX)  # Replace XXXXXX with your PostgreSQL connection string.

        with db_psql.begin() as conn:
            # CREATE TABLE fails if the target schema does not exist yet,
            # so create the sec and temp schemas first:
            conn.execute(text('CREATE SCHEMA IF NOT EXISTS sec'))
            conn.execute(text('CREATE SCHEMA IF NOT EXISTS temp'))

            # Create the sec.logs table:
            command = text('''CREATE TABLE IF NOT EXISTS sec.logs (cik bigint, date timestamp without time zone, url_path character varying(500))''')
            conn.execute(command)
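
To check that the table was created, you can query PostgreSQL's information_schema with the same engine; a quick sanity check:

    with db_psql.connect() as conn:
        tables = conn.execute(text("SELECT table_name FROM information_schema.tables WHERE table_schema = 'sec'"))
        print(tables.fetchall())  # should include ('logs',)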

We can now collect these functions, together with the remaining methods, into a single class:

    class sec_logs:
        def __init__(self, beg_yr = 2020, end_yr = 2022):
            self.beg_yr = beg_yr
            self.end_yr = end_yr

        def conn_db(self):
            # Create and return a SQLAlchemy engine for the PostgreSQL database.
            from sqlalchemy import create_engine
            db_psql = create_engine(XXXXXX)  # Replace XXXXXX with your PostgreSQL connection string.
            return db_psql

        def create_db(self):
            # This function creates the tables necessary to hold the parsed data.
            from sqlalchemy import create_engine
            from sqlalchemy import text

            db_psql = create_engine(XXXXXX)  # Replace XXXXXX with your PostgreSQL connection string.

            with db_psql.begin() as conn:
                # CREATE TABLE fails if the target schema does not exist yet,
                # so create the sec and temp schemas first:
                conn.execute(text('CREATE SCHEMA IF NOT EXISTS sec'))
                conn.execute(text('CREATE SCHEMA IF NOT EXISTS temp'))

                # Create the sec.logs table:
                command = text('''CREATE TABLE IF NOT EXISTS sec.logs (cik bigint, date timestamp without time zone, url_path character varying(500))''')
                conn.execute(command)

        def update_db(self):
            # This function will update the database based on the new log files that are available.
            pass

        def get_links(self):
            # Collect the links to all log files listed on the SEC's website.
            import requests
            from selectolax.parser import HTMLParser

            # The SEC requires a descriptive User-Agent header; replace the
            # placeholder below with your own name and email address.
            headers = {'User-Agent': 'Your Name your.email@example.com'}

            link_list = []
            for year in range(self.beg_yr, self.end_yr + 1):
                resp = requests.get(f'https://www.sec.gov/files/edgar{year}.html', headers=headers)
                html = HTMLParser(resp.text)

                # Grab the href attribute of every anchor tag on the page:
                for node in html.css('a'):
                    href = node.attributes.get('href')
                    if href is not None:
                        link_list.append(href)
            print(f'Created a list of links for the period between {self.beg_yr} and {self.end_yr}.')
            return link_list

        def download_daily_log_data(self):
            import pandas as pd
            from sqlalchemy import create_engine
            from sqlalchemy import text
            import sqlalchemy
            import numpy as np

            db_psql = create_engine(XXXXXX)  # Replace XXXXXX with your PostgreSQL connection string.

            # Get a list of all links:
            links = self.get_links()

            for link in links:
                # Download the log data. The SEC requires a descriptive User-Agent;
                # for HTTP(S) URLs, pandas forwards storage_options entries as request headers.
                print(f'Loading link {link}')
                df = pd.read_csv(link, storage_options={'User-Agent': 'Your Name your.email@example.com'})
                df = df[df.uri_path.notnull()]

                # Parse the timestamp and extract the CIK code from each log entry's path:
                df['date'] = pd.to_datetime(df['_time'], format='mixed')
                df['cik'] = df['uri_path'].str.extract(r'((?<=data/)\d{4,20}(?=/))', expand=False).fillna(0).astype(np.int64)
                df = df.rename(columns={'uri_path': 'url_path'})
                df = df.reindex(columns=['cik', 'date', 'url_path'])

                # Stage the batch in temp.logs, replacing any previous batch:
                df.to_sql(name='logs', con=db_psql, schema='temp', if_exists='replace', index=False, dtype={"cik": sqlalchemy.BigInteger, "date": sqlalchemy.DateTime, "url_path": sqlalchemy.VARCHAR(500)})

                with db_psql.begin() as conn:
                    # Drop the temp.new_logs table if it exists:
                    query = text('DROP TABLE IF EXISTS temp.new_logs')
                    conn.execute(query)

                    # Identify the log entries that do not yet exist in the database:
                    query = text('CREATE TABLE temp.new_logs as SELECT DISTINCT * FROM temp.logs b WHERE NOT EXISTS (SELECT * FROM sec.logs a WHERE a.date = b.date and a.url_path = b.url_path)')
                    conn.execute(query)

                    # Append the new entries to the main table:
                    query = text('INSERT INTO sec.logs SELECT * FROM temp.new_logs')
                    conn.execute(query)
            print(df.head())

        def link_db_compustat(self):
            # Create a mapping between CIKs and Compustat.
            pass

Finally, we can instantiate the class, create the target table, and download the log data:

    if __name__ == '__main__':
        scraper = sec_logs(beg_yr = 2020, end_yr = 2021)

        # The sec.logs table must exist before we can insert into it:
        scraper.create_db()
        scraper.download_daily_log_data()
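
Once the table is populated, the data can be queried directly. For example, a quick sketch counting log entries per CIK (again using the connection string placeholder):

    import pandas as pd
    from sqlalchemy import create_engine

    db_psql = create_engine(XXXXXX)  # Replace XXXXXX with your PostgreSQL connection string.
    hits = pd.read_sql('SELECT cik, COUNT(*) AS n_hits FROM sec.logs GROUP BY cik ORDER BY n_hits DESC', con=db_psql)
    print(hits.head())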