SEC Log files
The Securities and Exchange Commission (SEC) provides data on which SEC filings were accessed, and at which dates and times. The code below accesses the SEC website, obtains the internet-traffic data, and compiles it into a PostgreSQL database. Let's go through this process step by step:
Let's start by writing a function that collects the links to all the log files available on the SEC website:
def get_links(self):
    """Collect the URLs of all log-file pages listed on the SEC website.

    Iterates over every year from ``self.beg_yr`` to ``self.end_yr``
    (inclusive), fetches the EDGAR index page for that year, and gathers the
    ``href`` of every anchor tag found on the page.

    Returns:
        list[str]: the collected link targets, in page order.
    """
    import requests
    from selectolax.parser import HTMLParser
    # NOTE(review): the SEC rejects requests without a descriptive
    # User-Agent — replace the contact address with your own.
    headers = {'User-Agent': 'sec_logs scraper (contact: admin@example.com)'}
    link_list = []
    for year in range(self.beg_yr, self.end_yr + 1):
        resp = requests.get(f'https://www.sec.gov/files/edgar{year}.html',
                            headers=headers)
        resp.raise_for_status()  # fail loudly on 403/404 instead of parsing an error page
        for node in HTMLParser(resp.text).css('a'):
            # Anchors without an href raised KeyError in the original; skip them.
            href = node.attributes.get('href')
            if href:
                link_list.append(href)
    print(f'Created a list of links for the period between {self.beg_yr} and {self.end_yr}.')
    return link_list
Let's next write a function that creates the necessary SQL table. Once we are done with all the pieces, we can collect all of these functions within a class.
def create_db(self):
    """Create the PostgreSQL schemas and table that hold the parsed log data.

    Expects ``self.db_url`` to hold a SQLAlchemy connection URL (the original
    used an undefined ``XXXXXX`` placeholder, which raised NameError).
    Idempotent: uses ``IF NOT EXISTS`` throughout.
    """
    from sqlalchemy import create_engine, text
    engine = create_engine(self.db_url)
    with engine.begin() as conn:
        # The original CREATE TABLE "was not working" because the 'sec'
        # schema did not exist yet — CREATE TABLE does not create schemas.
        conn.execute(text('CREATE SCHEMA IF NOT EXISTS sec'))
        # 'temp' is the staging schema used by download_daily_log_data.
        conn.execute(text('CREATE SCHEMA IF NOT EXISTS temp'))
        conn.execute(text('''CREATE TABLE IF NOT EXISTS sec.logs (cik bigint, date timestamp without time zone, url_path character varying(500))'''))
class sec_logs:
    """Scrape SEC EDGAR log-file indexes and load them into PostgreSQL.

    The pipeline is: ``create_db`` (create schemas/tables, idempotent) →
    ``get_links`` (gather yearly index links) → ``download_daily_log_data``
    (parse each log CSV and upsert new rows into ``sec.logs``).
    """

    def __init__(self, beg_yr=2020, end_yr=2022, db_url=None):
        """Store the year range and the database connection URL.

        Args:
            beg_yr: first year of log files to process (inclusive).
            end_yr: last year of log files to process (inclusive).
            db_url: SQLAlchemy PostgreSQL URL. Defaults to the ``SEC_DB_URL``
                environment variable so credentials stay out of source control
                (the original hard-coded ``XXXXX`` placeholders, a NameError).
        """
        import os
        self.beg_yr = beg_yr
        self.end_yr = end_yr
        self.db_url = db_url if db_url is not None else os.environ.get('SEC_DB_URL')

    def conn_db(self):
        """Return a SQLAlchemy engine for the configured database.

        The original also opened a ``Connection`` it never closed or
        returned — a leak; the engine alone is what callers need.
        """
        from sqlalchemy import create_engine
        return create_engine(self.db_url)

    def create_db(self):
        """Create the schemas and table that hold the parsed data (idempotent)."""
        from sqlalchemy import create_engine, text
        engine = create_engine(self.db_url)
        with engine.begin() as conn:
            # CREATE TABLE sec.logs fails if the schema is missing — this was
            # the "not working for some strange reason" in the original.
            conn.execute(text('CREATE SCHEMA IF NOT EXISTS sec'))
            # Staging schema used by download_daily_log_data.
            conn.execute(text('CREATE SCHEMA IF NOT EXISTS temp'))
            conn.execute(text('''CREATE TABLE IF NOT EXISTS sec.logs (cik bigint, date timestamp without time zone, url_path character varying(500))'''))

    def update_db(self):
        """Update the database with newly available log files. (Not yet implemented.)"""
        pass

    def get_links(self):
        """Collect hrefs of all log-file links for ``beg_yr``..``end_yr``.

        Returns:
            list[str]: link targets from each yearly EDGAR index page.
        """
        import requests
        from selectolax.parser import HTMLParser
        # NOTE(review): SEC blocks requests without a descriptive User-Agent —
        # replace the contact address with your own.
        headers = {'User-Agent': 'sec_logs scraper (contact: admin@example.com)'}
        link_list = []
        for year in range(self.beg_yr, self.end_yr + 1):
            resp = requests.get(f'https://www.sec.gov/files/edgar{year}.html',
                                headers=headers)
            resp.raise_for_status()
            for node in HTMLParser(resp.text).css('a'):
                # Anchors without an href raised KeyError in the original.
                href = node.attributes.get('href')
                if href:
                    link_list.append(href)
        print(f'Created a list of links for the period between {self.beg_yr} and {self.end_yr}.')
        return link_list

    def download_daily_log_data(self):
        """Download each daily log file and insert only the new rows into sec.logs.

        For every link: read the CSV, drop rows without a URI path, parse the
        timestamp, extract the CIK from the URI, stage the frame in
        ``temp.logs``, then insert rows not already present in ``sec.logs``.
        """
        import pandas as pd
        import numpy as np
        import sqlalchemy
        from sqlalchemy import create_engine, text
        engine = create_engine(self.db_url)
        links = self.get_links()
        for link in links:
            print(f'Loading link {link}')
            df = pd.read_csv(link)
            df = df[df.uri_path.notnull()]
            df['date'] = pd.to_datetime(df['_time'], format='mixed')
            # CIK is the numeric directory segment after 'data/' in the URI;
            # rows without one get CIK 0.
            df['cik'] = (df['uri_path']
                         .str.extract(r'((?<=data/)\d{4,20}(?=/))')
                         .fillna(0)
                         .astype(np.int64))
            df = df.rename(columns={'uri_path': 'url_path'})
            df = df.reindex(columns=['cik', 'date', 'url_path'])
            # Stage the batch; 'replace' keeps only the current file's rows.
            df.to_sql(name='logs', con=engine, schema='temp',
                      if_exists='replace', index=False,
                      dtype={'cik': sqlalchemy.BigInteger,
                             'date': sqlalchemy.DateTime,
                             'url_path': sqlalchemy.VARCHAR(500)})
            with engine.begin() as conn:
                # Rebuild temp.new_logs with only the rows not yet in sec.logs.
                conn.execute(text('DROP TABLE IF EXISTS temp.new_logs'))
                conn.execute(text('CREATE TABLE temp.new_logs as SELECT DISTINCT * FROM temp.logs b WHERE NOT EXISTS (SELECT * FROM sec.logs a WHERE a.date = b.date and a.url_path = b.url_path)'))
                conn.execute(text('INSERT INTO sec.logs SELECT * FROM temp.new_logs'))
        # Guard: the original's print(df[:5]) raised NameError when no links
        # were found; also it leaked an unused Connection opened up front.
        if links:
            print(df[:5])

    def link_db_compustat(self):
        """Create a mapping between CIKs and Compustat. (Not yet implemented.)"""
        pass
if __name__ == '__main__':
    scraper = sec_logs(beg_yr=2020, end_yr=2021)
    # Ensure the target schemas/tables exist before loading; create_db is
    # idempotent, so this is safe to run on every invocation. The original
    # loaded data without ever creating the destination table.
    scraper.create_db()
    scraper.download_daily_log_data()