# Module QueryWRDS
# MIT License
#
# Copyright (c) 2023 Andrew Maurice Perry
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import os
import wrds
import numpy as np
import pandas as pd
import datetime
import sqlalchemy
from cprint import *
from pandas.tseries.offsets import *
import time
from pympler import asizeof
import pathlib
from py_functions import wavg, winsorize
import subprocess
import warnings
# Silence RuntimeWarnings (e.g. all-NaN slices / invalid comparisons raised while
# building the anomaly characteristics below).
warnings.simplefilter(action = 'ignore', category = RuntimeWarning)
# Ignore numpy divide-by-zero errors; ratio columns are cleaned up afterwards with np.where.
np.seterr(divide='ignore')
class QueryWRDS:
# TODO:
# (1) add more detail to string repr
# (2) setup update reminder if information is over a year out of date
# (3) blocking so it will work on other computers
def __init__(self, WRDS_username: str, local_db_path: pathlib.Path = None, update_all_tables: bool = False, tables_to_update: list = [], update_tolerance: int = 3) -> None:
    """
    Initializes the QueryWRDS class.

    During initialization all of the files needed from WRDS are downloaded (if missing)
    to create a local SQL database. Combined tables are then created: CRSP_M (monthly),
    which contains all of the variables in the stock file, names file, and delisting
    file; and CCM, which contains the most commonly used CRSP variables and the
    Compustat variables used to make the standard Fama-French anomaly characteristics
    (the standard anomaly characteristics are created here as well).

    Parameters
    ___________
    WRDS_username: Personal WRDS username.
    local_db_path: default = None; Location to create and read from the local SQL database.
    update_all_tables: default = False; If true the local database is deleted and recreated
        by downloading from WRDS.
    tables_to_update: default = []; List of tables to update from WRDS.
    update_tolerance: default = 3; Number of quarters the tables can be out of date.

    Note
    _____
    If no 'local_db_path' is given then the WRDS database is created in the current directory.

    Note
    _____
    The table names in the local SQL database mirror those found on WRDS but with underscores
    replacing periods. Thus, when updating tables you need to use the local names of the tables
    (i.e.) CRSP_MSF instead of CRSP.MSF.

    Note
    _____
    Anomaly characteristics created:
        * bm: Book-to-Market
        * pe: Price-to-Earnings
        * cfp: Cashflow-to-Price
        * inv: Investment
        * op: Operating Profitability
    """
    # NOTE(review): 'tables_to_update' has a mutable default argument ([]). It is only read
    # here, so this happens to be safe, but a 'None' default would be the idiomatic fix.
    # list of tables from WRDS to make local
    WRDS_TABLES = ['FF.FACTORS_DAILY', # FamaFrench Factors daily (used for risk-free rate)
                   'FF.FACTORS_MONTHLY', # FamaFrench Factors monthly (used for risk-free rate)
                   'CRSP.CCMXPF_LINKTABLE', # CCM link table used to merge CRSP and Compustat
                   'CRSP.MSEDELIST', # CRSP monthly delist events
                   'CRSP.MSF', # CRSP monthly stock file
                   'CRSP.MSENAMES', # CRSP monthly event file
                   'COMPA.FUNDA', # Compustat annual observations
                   'COMPA.FUNDQ'] # Compustat quarterly observations
    # list of created tables
    CREATED_TABLES = ['CRSP_M', # CRSP monthly file (i.e. merged CRSP.MSF, CRSP.MSENAMES, CRSP.MSEDELIST)
                      'CCM'] # CRSP/Compustat merged file
    # set class attributes
    self.WRDS_username = WRDS_username
    if(local_db_path is None):
        # set location to be in the current directory
        self.local_db_path = pathlib.Path('WRDS.db')
    else:
        # user specified location
        self.local_db_path = local_db_path
    self.update_all_tables = update_all_tables
    self.tables_to_update = tables_to_update
    self.update_tolerance = update_tolerance
    self.today_date = datetime.date.today().strftime('%Y-%m-%d')
    # used to update all tables: deleting the database file forces a full re-download below
    if(self.update_all_tables and self.local_db_path.exists()):
        cprint.warn('Updating the tables in the local database. This process could take a long time...')
        os.remove(self.local_db_path)
    # create sql engine (creates the sqlite file if it does not exist)
    self.sql_engine = sqlalchemy.create_engine('sqlite:///' + str(self.local_db_path))
    # list of current tables
    # check to see if all required tables are present, if not load the ones that are missing
    inspect = sqlalchemy.inspect(self.sql_engine)
    self.curr_tables = inspect.get_table_names()
    # delete tables that should be updated
    for table_name in self.tables_to_update:
        if(table_name in self.curr_tables):
            with self.sql_engine.connect() as conn:
                if(table_name not in CREATED_TABLES):
                    _ = conn.execute(f"""DROP TABLE {table_name}""")
                    # drop combo files and remake (they are derived from the raw table just dropped)
                    for cr_table in CREATED_TABLES:
                        _ = conn.execute(f'DROP TABLE {cr_table}')
                else:
                    # only drop the created table and remake
                    _ = conn.execute(f'DROP TABLE {table_name}')
    table_names = [name.replace('.', '_') for name in WRDS_TABLES] # local table names
    # check CSV directory for files to include
    CSV_directory = self.local_db_path.parent / 'CSVtoSQL'
    for csvfile in os.listdir(CSV_directory):
        f = os.path.join(CSV_directory, csvfile)
        if(os.path.isfile(f)):
            filepath = pathlib.Path(f)
            # NOTE(review): str.strip('.csv') strips any of the characters '.', 'c', 's', 'v'
            # from BOTH ends of the name (e.g. 'costs.csv' -> 'ost'), it does not remove the
            # extension; 'filepath.stem' is the safe way to do this -- confirm table names.
            tablename = filepath.name.strip('.csv')
            if(tablename in self.curr_tables): continue
            s = time.time()
            cprint.info(f'Adding {filepath.name} to SQL database {self.local_db_path.name}...')
            # bulk import via the sqlite3 CLI (much faster than pandas for large CSVs)
            subprocess.call(['sqlite3', f'{self.local_db_path}', '.mode csv', f'.import {filepath} {tablename}', '.mode columns'])
            e = time.time()
            cprint.info(f'Finished {filepath.name}: {round(e - s, 3)}s')
    # update current tables
    self.curr_tables = inspect.get_table_names()
    # read in the data from WRDS for any raw table that is still missing
    if(not all(elem in self.curr_tables for elem in table_names)):
        missing_tables = list(set(table_names) - set(inspect.get_table_names()))
        cprint.warn(f'The following tables are missing from the local database: {missing_tables}. Querying WRDS to add them to the local database.')
        cprint.info('Connecting to WRDS...')
        self.WRDS_db = wrds.Connection(username = self.WRDS_username)
        for table_name in missing_tables:
            # recover the WRDS schema-qualified name (only the first '_' becomes '.')
            table = table_name.replace('_', '.', 1)
            print('-------------------------------------')
            cprint.info(f'Starting {table}')
            s = time.time()
            sql_str = '''SELECT * FROM ''' + table
            # download the data to a dataframe
            df = self.WRDS_db.raw_sql(sql_str)
            cprint.ok(f'Dataframe in memory: {asizeof.asizeof(df) / (10 ** 9)}GB')
            # add end of month column for CRSP_MSEDELIST
            if(table_name == 'CRSP_MSEDELIST'): df['date'] = df.dlstdt + MonthEnd(0)
            if(table_name == 'CRSP_DSEDELIST'): df['date'] = df.dlstdt # create date column for merging
            # write the dataframe to the local sql database
            df.to_sql(table_name, con = self.sql_engine, if_exists = 'replace', index = False)
            del df
            e = time.time()
            cprint.info(f'Finished {table}: {round(e - s, 3)}s')
            print('-------------------------------------\n')
        cprint.info('Raw WRDS files have been added to the local databse.')
    # create the combined monthly CRSP table (stock file + names file + delisting file)
    if('CRSP_M' not in self.curr_tables):
        cprint.info(f'Creating combined data table CRSP_M...')
        # single-row reads used only to discover each raw table's column names
        sf_df = pd.read_sql(f"""SELECT * FROM CRSP_MSF LIMIT 1""", con = self.sql_engine)
        names_df = pd.read_sql(f"""SELECT * FROM CRSP_MSENAMES LIMIT 1""", con = self.sql_engine)
        delsit_df = pd.read_sql(f"""SELECT * FROM CRSP_MSEDELIST LIMIT 1""", con = self.sql_engine)
        # build the SELECT list, preferring CRSP_MSF's copy of any duplicated column
        vars_to_select = ''
        for var in list(sf_df.columns):
            vars_to_select += f'CRSP_MSF.{var}, '
        for var in list(set(list(names_df.columns)) - set(list(sf_df.columns))):
            vars_to_select += f'CRSP_MSENAMES.{var}, '
        for var in list(set(list(delsit_df.columns)) - set(list(sf_df.columns)) - set(list(names_df.columns))):
            vars_to_select += f'CRSP_MSEDELIST.{var}, '
        # drop the trailing ', '
        vars_to_select = vars_to_select[:-2]
        sql_dic = {'vars': vars_to_select}
        # names file rows are joined on their validity window; delist events join on month-end date
        sql_str = '''CREATE TABLE CRSP_M AS
SELECT {0} FROM CRSP_MSF
LEFT JOIN CRSP_MSENAMES ON CRSP_MSF.permno = CRSP_MSENAMES.permno AND CRSP_MSENAMES.namedt <= CRSP_MSF.date AND CRSP_MSF.date <= CRSP_MSENAMES.nameendt
LEFT JOIN CRSP_MSEDELIST ON CRSP_MSF.permno = CRSP_MSEDELIST.permno AND CRSP_MSF.date = CRSP_MSEDELIST.date'''.format(sql_dic['vars'])
        with self.sql_engine.connect() as conn:
            _ = conn.execute(sql_str)
        cprint.info('Combined CRSP tables have been created.')
    # create merged CRSP and Compustat table
    if('CCM' not in self.curr_tables):
        table = 'CCM'
        cprint.info(f'Creating combined CRSP and Compustat table: {table}')
        # wide bounds so the entire available history is pulled
        start_date = datetime.date(1900, 6, 30)
        end_date = datetime.date(2100, 6, 30)
        # Compustat -------------------------------------------------------------------------------------------
        COMP_df = self.query_Compustat(start_date, end_date, 'A', sub_vars = ['ticker'], add_vars = ['years_in', 'fyear', 'revt', 'adjex_f'])
        COMP_df['year'] = COMP_df.datadate.dt.year
        # create preferred stock: redemption value, else liquidating value, else carrying value, else 0
        COMP_df['ps'] = np.where(COMP_df.pstkrv.isnull(), COMP_df.pstkl, COMP_df.pstkrv)
        COMP_df.ps = np.where(COMP_df.ps.isnull(), COMP_df.pstk, COMP_df.ps)
        COMP_df.ps = np.where(COMP_df.ps.isnull(), 0, COMP_df.ps)
        COMP_df.txditc = COMP_df.txditc.fillna(0)
        # create book equity (deferred taxes included only pre-1993)
        COMP_df['be'] = np.where(COMP_df.fyear < 1993, COMP_df.seq + COMP_df.txditc - COMP_df.ps, COMP_df.seq - COMP_df.ps)
        # earnings
        COMP_df['earn'] = np.where(~COMP_df.ib.isnull(), COMP_df.ib, np.nan)
        # operating profitability (requires at least one expense item to be non-missing)
        COMP_df['xp_allnan'] = (COMP_df.cogs.isnull()) & (COMP_df.xsga.isnull()) & (COMP_df.xint.isnull())
        COMP_df['profit'] = COMP_df.revt - COMP_df.cogs.fillna(0) - COMP_df.xint.fillna(0) - COMP_df.xsga.fillna(0)
        # NOTE(review): the zero-denominator test uses raw 'mib' (NaN-propagating) while the
        # divisor uses mib.fillna(0) -- confirm the asymmetry is intentional.
        COMP_df['op'] = np.where(COMP_df.be + COMP_df.mib != 0, COMP_df.profit / (COMP_df.be + COMP_df.mib.fillna(0)), np.nan)
        COMP_df.op = np.where(((~COMP_df.op.isnull()) & (~COMP_df.revt.isnull()) & (~COMP_df.xp_allnan)), COMP_df.op, np.nan)
        # NOTE: Compustat data yields gross outliers in 'op' w/ ratios as large as '1,000'.
        # To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
        # values for 'op' outside the 99th percentile are set to missing.
        COMP_df.op = np.where((COMP_df.op <= COMP_df.op.quantile(0.99)), COMP_df.op, np.nan)
        # investment: log asset growth, falling back to simple growth on floating point errors
        try:
            COMP_df['inv'] = np.log(COMP_df['at']) - np.log(COMP_df.groupby(by = ['gvkey'])['at'].shift(1))
        except FloatingPointError:
            COMP_df['inv'] = (COMP_df['at'] / COMP_df.groupby(by = ['gvkey'])['at'].shift(1)) - 1
        COMP_df.inv = np.where(~COMP_df.inv.isnull(), COMP_df.inv, np.nan)
        # NOTE: Compustat data yields gross outliers in 'inv' w/ percentages as low as '-100%' and as large as '10,000%'.
        # These outliers are pervasive on the left tail of the distribution.
        # To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
        # values for 'inv' outside [15th, 99th] percentiles are winsorized.
        COMP_df.inv = np.where((COMP_df.inv.quantile(0.15) <= COMP_df.inv), COMP_df.inv, COMP_df.inv.quantile(0.15))
        COMP_df.inv = np.where((COMP_df.inv <= COMP_df.inv.quantile(0.99)), COMP_df.inv, COMP_df.inv.quantile(0.99))
        # cash flow
        COMP_df['cf'] = COMP_df.ib + COMP_df.txdi.fillna(0) + COMP_df.dpre.fillna(0)
        COMP_df.cf = np.where(~COMP_df.cf.isnull(), COMP_df.cf, np.nan)
        # accruals (change in split-adjusted operating working capital per share, scaled by book equity per share)
        COMP_df['csho_adj'] = np.where((COMP_df.csho * COMP_df.adjex_f > 0), COMP_df.csho * COMP_df.adjex_f, np.nan)
        COMP_df['owcap_adj'] = ((COMP_df.act - COMP_df.che) - (COMP_df.lct.fillna(0) - COMP_df.dlc.fillna(0))) / COMP_df.csho_adj
        COMP_df['d_owcap_adj'] = COMP_df.owcap_adj - COMP_df.groupby(by = ['gvkey'])['owcap_adj'].shift(1)
        COMP_df['ac'] = np.where(~COMP_df.csho_adj.isnull(), COMP_df.d_owcap_adj / (COMP_df.be / COMP_df.csho_adj), np.nan)
        # NOTE: Compustat data yields gross outliers in 'ac' for June of each year {t} w/ ratios as low as '-200' and as large as '200'.
        # To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
        # values for 'ac' less than '-200' and values for 'ac' larger than '200' are set to missing.
        COMP_df.ac = np.where(-200 <= COMP_df.ac, COMP_df.ac, np.nan)
        COMP_df.ac = np.where(COMP_df.ac <= 200, COMP_df.ac, np.nan)
        # net shares issued (log growth in split-adjusted shares outstanding)
        COMP_df['ni_csho_adj'] = np.where(COMP_df.csho * COMP_df.adjex_f > 0, COMP_df.csho * COMP_df.adjex_f, np.nan)
        try:
            COMP_df['nsi'] = np.log(COMP_df.ni_csho_adj) - np.log(COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1))
        except FloatingPointError:
            COMP_df['nsi'] = (COMP_df.ni_csho_adj / COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1)) - 1
        COMP_df.nsi = np.where(~COMP_df.nsi.isnull(), COMP_df.nsi, np.nan)
        # NOTE: Compustat data yields outliers in 'ni' w/ ratios as large as '20'.
        # To be consistent w/ summary statistics for characteristics provided by Ken French's online library,
        # values for 'ni' outside the 99.9th percentile are set to missing.
        COMP_df.nsi = np.where(COMP_df.nsi <= COMP_df.nsi.quantile(0.999), COMP_df.nsi, np.nan)
        COMP_df = COMP_df.drop(columns = ['owcap_adj', 'xp_allnan'])
        # CRSP ------------------------------------------------------------------------------------------------
        CRSP_df = self.query_CRSP(start_date, end_date, 'M')
        CRSP_df['jdate'] = CRSP_df.date
        # create timing columns
        CRSP_df['year'] = CRSP_df['jdate'].dt.year
        CRSP_df['month'] = CRSP_df['jdate'].dt.month
        # turnover (turn)
        # The turnover (TURN) for each stock is defined the monthly traded volume scaled by the total number of shares outstanding.
        CRSP_df['turn'] = CRSP_df.vol / CRSP_df.shrout
        # traded volume in dollars (dvol)
        # The traded volume in dollars (DVOL) is defined as the number of shares traded in a given month multiplied by the closing stock price.
        CRSP_df['dvol'] = CRSP_df.vol * CRSP_df.prc.abs()
        # calculate prior returns
        # Prior 1-1 is the cummulative return in [t - 1]
        # Prior 2-12 is the cummulative return from [t - 12] to [t - 2]
        # Prior 13-60 is the cummulative return from [t - 60] to [t - 13]
        for ret_typ in ['adjret', 'adjretx']:
            for typ in [(1, 1), (2, 12), (13, 60)]:
                name = f'pr{typ[0]}_{typ[1]}' if(ret_typ == 'adjret') else f'prx{typ[0]}_{typ[1]}'
                CRSP_df[name] = 1
                # compound the (1 + lagged return) terms over the window
                for i in range(typ[0], typ[1] + 1):
                    CRSP_df[f'{ret_typ}_L{i}'] = 1 + CRSP_df.groupby(by = ['permno'])[ret_typ].shift(i)
                    CRSP_df[name] *= CRSP_df[f'{ret_typ}_L{i}']
                # drop the temporary per-lag columns
                CRSP_df = CRSP_df.drop(CRSP_df.filter(regex = '_L').columns, axis = 1)
                CRSP_df[name] -= 1
        ### Aggregate Market Cap ###
        # sum of me across different permno belonging to same permco a given date
        crsp_summe = CRSP_df.groupby(['jdate','permco'])['me'].sum().reset_index()
        # largest mktcap within a permco/date
        crsp_maxme = CRSP_df.groupby(['jdate','permco'])['me'].max().reset_index()
        # join by jdate/maxme to find the permno
        CRSP_df = CRSP_df.merge(crsp_maxme, how='inner', on=['jdate','permco','me'])
        # drop me column and replace with the sum me
        CRSP_df = CRSP_df.drop(columns = ['me'])
        # join with sum of me to get the correct market cap info
        CRSP_df = CRSP_df.merge(crsp_summe, how='inner', on=['jdate','permco'])
        ### July to June dates (Fama-French timing convention)
        CRSP_df['ffdate'] = CRSP_df['jdate'] + MonthEnd(-6)
        CRSP_df['ffyear'] = CRSP_df['ffdate'].dt.year
        CRSP_df['ffmonth'] = CRSP_df['ffdate'].dt.month
        CRSP_df['1+adjretx'] = 1 + CRSP_df.adjretx
        # cumret by stock and fama-french year
        CRSP_df['ffyear_cumretx'] = CRSP_df.groupby(by = ['permno', 'ffyear'])['1+adjretx'].cumprod()
        # lag of ffyear_cumretx
        CRSP_df['L_ffyear_cumretx'] = CRSP_df.groupby(by = ['permno'])['ffyear_cumretx'].shift(1)
        # lag market cap
        CRSP_df['L_me']=CRSP_df.groupby(by = ['permno'])['me'].shift(1)
        # if first permno then use me/(1+retx) to replace the missing value
        CRSP_df['months_in'] = CRSP_df.groupby(by = ['permno']).cumcount()
        CRSP_df.L_me = np.where(CRSP_df.months_in == 0, CRSP_df.me / CRSP_df['1+adjretx'], CRSP_df.L_me)
        # baseline me (market cap at the start of each Fama-French year)
        mebase = CRSP_df[CRSP_df.ffmonth == 1][['permno','ffyear', 'L_me']].rename(columns={'L_me': 'mebase'})
        # merge result back together
        CRSP_df = CRSP_df.merge(mebase, how = 'left', on = ['permno', 'ffyear'])
        # portfolio weight: June market cap carried forward within the FF year
        CRSP_df['wt'] = np.where(CRSP_df.ffmonth == 1, CRSP_df.L_me, CRSP_df.mebase * CRSP_df.L_ffyear_cumretx)
        # Info as of June & December and merge together for characteristics
        CRSP_JUN_df = CRSP_df[CRSP_df.month == 6]
        CRSP_DEC_df = CRSP_df[CRSP_df.month == 12]
        CRSP_DEC_df = CRSP_DEC_df[['permno','date','jdate','me','year']].rename(columns={'me': 'dec_me'})
        # December me of year t-1 is matched to June of year t
        CRSP_DEC_df.year += 1
        CRSP_DEC_df = CRSP_DEC_df[['permno','year','dec_me']]
        CRSP_JUN_df = CRSP_JUN_df.merge(CRSP_DEC_df, how = 'inner', on = ['permno', 'year'])
        CRSP_JUN_df = CRSP_JUN_df[['permno', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'mebase', 'L_me', 'dec_me']]
        CRSP_JUN_df = CRSP_JUN_df.drop_duplicates()
        # query the link table
        link_table_df = self.query_link_table()
        CCM_df = COMP_df.merge(link_table_df, how='left', on=['gvkey'])
        # accounting data of fiscal year-end t is matched to June of t+1 (6-month lag)
        CCM_df['jdate'] = CCM_df.year_end + MonthEnd(6)
        # set link date bounds
        CCM_df = CCM_df[(CCM_df.jdate >= CCM_df.linkdt) & (CCM_df.jdate <= CCM_df.linkenddt)]
        # link COMP_df and crsp
        CCM_JUN_df = CRSP_JUN_df.merge(CCM_df, how = 'inner', on = ['permno', 'jdate'])
        CCM_JUN_df['ffyear'] = CCM_JUN_df.jdate.dt.year
        # drop columns before merging
        CCM_JUN_df = CCM_JUN_df.drop(columns = ['mebase', 'L_me', 'linktype', 'linkprim', 'linkenddt', 'linkdt', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'year', 'permco'])
        CRSP_df = CRSP_df.drop(columns = ['L_me', 'L_ffyear_cumretx', 'ffyear_cumretx', '1+adjretx', 'mebase', 'jdate'])
        # merge to monthly
        CCM_df = CRSP_df.merge(CCM_JUN_df, how = 'left', on = ['permno', 'ffyear'])
        CCM_df = CCM_df.dropna(subset = ['datadate'])
        CCM_df = CCM_df.sort_values(by = ['permno', 'date'])
        # create characteristics ('ff' prefix = scaled by December me, Fama-French style)
        # book to market
        CCM_df['ffbm'] = np.where(CCM_df.dec_me != 0, CCM_df.be / CCM_df.dec_me, np.nan)
        CCM_df['bm'] = np.where(CCM_df.me != 0, CCM_df.be / CCM_df.me, np.nan)
        # earnings to price
        CCM_df['ffep'] = np.where(CCM_df.dec_me != 0, CCM_df.earn / CCM_df.dec_me, np.nan)
        CCM_df['ep'] = np.where(CCM_df.me != 0, CCM_df.earn / CCM_df.me, np.nan)
        # cashflow to price
        CCM_df['ffcfp'] = np.where(CCM_df.dec_me != 0, CCM_df.cf / CCM_df.dec_me, np.nan)
        CCM_df['cfp'] = np.where(CCM_df.me != 0, CCM_df.cf / CCM_df.me, np.nan)
        # market beta: rolling (60-month window, 24-month minimum) cov(ret, mkt) / var(mkt)
        mkt_s = CCM_df.groupby('date').apply(wavg, 'adjret', 'me')
        mkt_s.name = 'mkt'
        mkt_s = mkt_s.to_frame().reset_index()
        CCM_df = CCM_df.merge(mkt_s, how = 'left', on = ['date'])
        CCM_df['cov'] = CCM_df.groupby(['permno'])[['adjret', 'mkt']].rolling(min_periods = 24, window = 60).cov().unstack()['adjret']['mkt'].reset_index(level = 'permno').rename(columns = {'mkt': 'cov'})[['cov']]
        CCM_df['var'] = CCM_df.groupby(['permno'])['mkt'].rolling(min_periods = 24, window = 60).var().reset_index(level = 'permno').rename(columns = {'mkt': 'var'})[['var']]
        CCM_df['beta'] = CCM_df['cov'] / CCM_df['var']
        ## additional characteristics
        # sale-to-price
        CCM_df['sp'] = np.where(CCM_df.me != 0, CCM_df.sale / CCM_df.me, np.nan)
        #### ISSUES WITH GROWTH VARS
        # NOTE(review): these pct_change calls are NOT grouped by permno/gvkey, so the first
        # observation of each firm uses the previous firm's value -- likely the issue flagged above.
        # earnings growth
        CCM_df['eg'] = CCM_df.earn.pct_change()
        # growth in sales
        CCM_df['grs'] = CCM_df.sale.pct_change()
        # growth in book value
        CCM_df['grb'] = CCM_df.be.pct_change()
        # growth in cashflow
        CCM_df['grcf'] = CCM_df.cf.pct_change()
        # helper: assign each observation its cross-sectional percentile rank (0-100) for 'var'
        def _percentile_rank(df, var):
            ptiles = list(df[var].quantile(q = list(np.arange(start = 0, step = 0.01, stop = 1))))
            df[f'{var}_pr'] = 100
            for i in range(99, 0, -1):
                mask = df[var] < ptiles[i]
                df.loc[mask, f'{var}_pr'] = i
            return(df)
        for var in ['bm', 'sp', 'cfp', 'dp', 'eg', 'grs', 'grcf', 'grb']:
            CCM_df = _percentile_rank(CCM_df, var)
        s = time.time()
        # Pastor-Stambaugh Liquidty Index (psliq)
        # The Pastor-Stambaugh Liquidity Index (PSLIQ) is defined for each stock as follow. We run one regression for each calendar month based on each stock’s daily return, using the current daily
        # return in the left-hand side. The right-hand side variables are the lagged daily return as well as the lagged daily return interacted with the lagged traded dollar volume. The coefficient of the
        # interaction term is the measure of liquidity – for each stock and each month.
        #
        # NOTE(review): psliq is not implemented yet; the timer below measures an empty section.
        e = time.time()
        # NOTE(review): leftover debug output -- consider removing before release.
        print(CCM_df.head(50))
        print(f'time = {e - s}')
        # save merged dataframe to SQL database
        CCM_df = CCM_df.drop_duplicates()
        CCM_df.to_sql(table, con = self.sql_engine, if_exists = 'replace', index = False)
        cprint.ok('CRSP and Compustat merged table has been created.')
def __str__(self) -> str:
#TODO: print tables and columns from loacl database
return(f"WRDS Username: {self.username}")
def raw_sql(self, sql_str):
    """
    Run an arbitrary SQL statement against the underlying local database and
    return the result as a dataframe (or None if the user declines).

    Note
    _____
    This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database.
    """
    cprint.warn('The operation that you are about to perform might damage the local database. Do you wish to continue [y/n]:')
    # guard clause: anything other than an explicit 'y' aborts the operation
    if(input() != 'y'):
        cprint.info('Operation cancelled.')
        return(None)
    return(pd.read_sql(sql_str, con = self.sql_engine))
def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pd.DataFrame:
    """
    Used to query the merged CRSP/Compustat (CCM) table.

    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date: The ending date of the data query.

    Keyword Arguments
    __________________
    vars: list; The variables to query for.
    add_vars: list; Additional variables to query for on top of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to true to query for all variables in the table.
    id_type: str; Type of ID used to query for specific assets.
        Choices are:
            * ticker
            * gvkey
            * permno
            * cusip
            * permco
    ids: list; The ids of type 'id_type' to query for.

    Returns
    ________
    pd.DataFrame of the requested variables, typed, de-duplicated, and sorted by
    ['permno', 'date'] with a fresh integer index.

    Raises
    _______
    Exception: if both 'vars' and 'add_vars' are given, if a requested variable is not
        in the CCM table, or if only one of 'id_type'/'ids' is supplied.

    Note
    _____
    The variables that can be queried for are listed in VALID_VARS below; the CCM table
    does not contain every variable that exists in the raw CRSP and Compustat tables.

    Note
    _____
    The default variables that are queried for are listed in DEFAULT_VARS below.
    """
    # vars that can be queried for
    VALID_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'dp', 'year', 'month', 'pr1_1', 'pr2_12', 'pr13_60', 'prx1_1', 'prx2_12', 'prx13_60',
                  'me', 'ffdate', 'ffyear', 'ffmonth', 'months_in', 'wt', 'dec_me', 'dltt', 'mib', 'revt', 'csho', 'adjex_f', 'act', 'xint', 'pstk', 'txdi', 'gvkey', 'ib', 'xsga', 'dlc', 'ceq', 'che', 'datadate', 'txdc', 'dpc', 'ibc',
                  'fyear', 'pstkl', 'teq', 'cogs', 'pstkrv', 'lct', 'dpre', 'txditc', 'seq', 'at', 'sale', 'year_end', 'years_in', 'ps', 'be', 'earn', 'profit', 'op', 'inv', 'cf', 'csho_adj', 'd_owcap_adj', 'ac', 'ni_csho_adj', 'nsi', 'ffbm',
                  'bm', 'ffep', 'ep', 'ffcfp', 'cfp', 'beta']
    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'gvkey', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'datadate', 'year_end', 'ffdate', 'prc', 'shrout', 'adjret', 'adjretx',
                    'me', 'wt', 'dp', 'be', 'bm', 'ffbm', 'ep', 'ffep', 'cfp', 'ffcfp', 'inv', 'op', 'pr2_12', 'pr1_1', 'pr13_60', 'beta', 'ac', 'nsi', 'years_in', 'months_in', 'month', 'ffyear']
    VARS_DATA_TYPE = {'permno': str, 'permco': str, 'ticker': str, 'shrcd': str, 'exchcd': str, 'prc': float, 'shrout': float, 'adjret': float, 'adjretx': float, 'adjcumret': float,
                      'adjcumretx': float, 'dp': float, 'year': int, 'month': int, 'pr1_1': float, 'pr2_12': float, 'pr13_60': float, 'prx1_1': float, 'prx2_12': float, 'prx13_60': float,
                      'me': float, 'ffyear': int, 'ffmonth': int, 'months_in': int, 'wt': float, 'dec_me': float, 'dltt': float, 'mib': float, 'revt': float, 'csho': float, 'adjex_f': float,
                      'act': float, 'xint': float, 'pstk': float, 'txdi': float, 'gvkey': str, 'ib': float, 'xsga': float, 'dlc': float, 'ceq': float, 'che': float, 'txdc': float, 'dpc': float, 'ibc': float,
                      'fyear': int, 'pstkl': float, 'teq': float, 'cogs': float, 'pstkrv': float, 'lct': float, 'dpre': float, 'txditc': float, 'seq': float, 'at': float, 'sale': float, 'years_in': int,
                      'ps': float, 'be': float, 'earn': float, 'profit': float, 'op': float, 'inv': float, 'cf': float, 'csho_adj': float, 'd_owcap_adj': float, 'ac': float, 'ni_csho_adj': float, 'nsi': float, 'ffbm': float,
                      'bm': float, 'ffep': float, 'ep': float, 'ffcfp': float, 'cfp': float, 'beta': float}
    # date-typed columns: converted with pd.to_datetime instead of astype
    DATE_VARS = ['date', 'datadate', 'ffdate', 'year_end']
    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################
    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')
    # create list of the variables being queried
    if('vars' in kwargs):
        # BUG FIX: copy so the caller's list is not mutated by the inserts below
        query_vars = list(kwargs['vars'])
        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars): query_vars.insert(0, 'permno')
        # add date if people forgot
        if('date' not in query_vars): query_vars.insert(0, 'date')
    else:
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + list(kwargs['add_vars'])
        else:
            query_vars = list(DEFAULT_VARS)
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
    # BUG FIX: honor the value of 'all_vars' (previously 'all_vars = False' still selected every variable)
    if(kwargs.get('all_vars', False)): query_vars = list(VALID_VARS)
    # make sure that all vars are valid to be queried
    all_valid = all(elem in VALID_VARS for elem in query_vars)
    if(not all_valid):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from the combined CRSP/Compustat merged table. The CCM table does not contain all of the variables that are in CRSP and Compustat.')
    # optional filtering down to specific assets
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']
    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################
    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CCM_sql_query(start_date, end_date, vars = query_vars,
                                             specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)
    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################
    # 'fyear' can come back from the database as text; force it to float before the bulk cast
    if('fyear' in raw_df.columns):
        raw_df.fyear = raw_df.fyear.astype(float)
    # convert only the date columns that were actually queried
    # (BUG FIX: previously all four were converted unconditionally, so a 'vars' query
    # without e.g. 'datadate' raised an AttributeError)
    for date_var in DATE_VARS:
        if(date_var in raw_df.columns):
            raw_df[date_var] = pd.to_datetime(raw_df[date_var], format = '%Y-%m-%d')
    # cast the remaining columns to their documented data types
    queried_vars = list(set(list(raw_df.columns)) - set(DATE_VARS))
    vars_dtypes = {var: VARS_DATA_TYPE[var] for var in queried_vars}
    raw_df = raw_df.astype(vars_dtypes)
    # replace any python objects 'None' with np.nan
    raw_df = raw_df.fillna(value = np.nan)
    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[query_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)
    # return dataframe
    return(raw_df)
def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) -> pd.DataFrame:
"""
Used to query the raw Compustat tables.
Parameters
___________
start_date: The starting date of the data query.
end_date: The ending date of the data query.
freq: The frequency of the data query.
Choices are:
* Q: quarterly
* A: annual
Keyword Arguments
__________________
vars: list; The variables to query for.
add_vars: list; Additional variables to query for ontop of the default variables.
sub_vars: list; Variables to remove from the default variables.
all_vars: bool; Set to true to query for all variables in the table.
id_type: str; Type of ID used to query for specific assets.
Choices are:
* ticker
* gvkey
* permno
* cusip
* permco
ids: list; The ids of type 'id_type' to query for.
Note
_____
The defualt variables that are queired for from the quarterly file have their names changed to mirror those in the annual
file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the
fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to
the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make
it easier to compute the anomally characterisitcs when creating the combined CCM tables.
Note
_____
By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks.
To do this you need to know the actual name of the varibale that you want to query for, paying attention to Compustat's
naming conventions between their annual and quarterly files.
Note
_____
The defualt variables that are queried for are if the frequency given is annual:
'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq',
'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib'
If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'.
Note
_____
There is less error checking in this function compared to the other methods in this class because of the large number of variables
in Compustat.
"""
STD_VARS = None
if(freq == 'A'):
STD_VARS = ['gvkey', 'datadate', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdc', 'dpc', 'che', 'dlc', 'ceq', 'seq', 'teq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'ibc', 'dltt', 'mib', 'ib', 'dp']
else:
STD_VARS = ['gvkey', 'datadate', 'tic', 'atq', 'saleq', 'cogsq', 'actq', 'txdiq', 'cshoq', 'lctq', 'txdcy', 'dpcy', 'cheq', 'dlcq', 'ceqq', 'seqq', 'teqq', 'pstkq', 'txditcq', 'xintq', 'xsgaq', 'ibcy', 'dlttq', 'mibq', 'ibq', 'dpq']
DEFAULT_DTYPES = {'gvkey': str, 'ticker': str, 'at': float, 'sale': float, 'cogs': float, 'act': float, 'txdi': float, 'csho': float, 'lct': float, 'dltt': float, 'mib': float,
'txdc': float, 'dpre': float, 'che': float, 'dlc': float, 'ceq': float, 'seq': float, 'teq': float, 'pstk': float, 'txditc': float, 'xint': float, 'xsga': float, 'ibc': float, 'ib': float}
CREATED_VARS = ['years_in']
#############################################################################################################################################
# Query Validation and Error Checking
#############################################################################################################################################
if(freq not in ['Q', 'A']): raise Exception('Invlaid frequency given to query_compustat')
# keywrods 'additional_vars' and 'vars' cannot be used simultaneously
if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')
# create list of the variables being quireied
query_vars = None
if('vars' in kwargs):
# variable arguments to query for
query_vars = kwargs['vars']
# 'permno' needs to be in the query vars for merging
if('gvkey' not in query_vars): query_vars.insert(0, 'gvkey')
# add date if people forgot
if('datadate' not in query_vars and 'date' not in query_vars): query_vars.insert(0, 'datadate')
else:
if('add_vars' in kwargs):
query_vars = STD_VARS + kwargs['add_vars']
else:
query_vars = STD_VARS
if('sub_vars' in kwargs):
sub_vars = ['tic' if elem == 'ticker' else elem for elem in kwargs['sub_vars']]
query_vars = [elem for elem in query_vars if elem not in sub_vars]
query_vars = ['datadate' if elem == 'date' else elem for elem in query_vars]
query_vars = ['tic' if elem == 'ticker' else elem for elem in query_vars]
query_vars = ['conm' if elem == 'comnam' else elem for elem in query_vars]
if('all_vars' in kwargs): query_vars = None
indfmts = kwargs['indfmts'] if('indfmts' in kwargs) else ['INDL'] # default: Industrial, Financial
datafmts = kwargs['datafmts'] if('datafmts' in kwargs) else ['STD'] # default: Standard
popsrcs = kwargs['popsrcs'] if('popsrcs' in kwargs) else ['D'] # default: Consolidated
consols = kwargs['consols'] if('consols' in kwargs) else ['C'] # default: Consolidated
specific_query = False
id_type = ''
ids = []
if('id_type' in kwargs or 'ids' in kwargs):
if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
specific_query = True
id_type = kwargs['id_type']
if(id_type == 'ticker'): id_type = 'tic'
ids = kwargs['ids']
query_vars_DB = list(set(query_vars) - set(CREATED_VARS))
##############################################################################################################################################
# Load the raw data
##############################################################################################################################################
# read in raw dataframe from local sql database
raw_df = pd.read_sql(self._compustat_SQL_query(start_date, end_date, freq, vars = query_vars_DB,
indfmt = indfmts, datafmt = datafmts, popsrc = popsrcs, consol = consols,
specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)
##############################################################################################################################################
# Clean the raw data
##############################################################################################################################################
# rename columns
raw_df = raw_df.rename(columns = {'tic': 'ticker', 'conm':'comnam'})
# rename the default columns to match the names from the COMPA_FUNDA
if(freq == 'Q'):
# quarterly compustat
# dont balme me for the different names blame compustat
raw_df = raw_df.rename(columns = {'atq':'at', 'seqq':'seq', 'ceqq':'ceq', 'teqq':'teq',
'pstkq':'pstk', 'txdcy':'txdc', 'txditcq':'txditc', 'saleq':'sale',
'cogsq':'cogs', 'xintq':'xint', 'xsgaq':'xsga', 'mibq':'mib',
'ibcy':'ibc', 'txdiq':'txdi', 'dpq':'dpre', 'cshoq':'csho', 'adjex':'adjex_f',
'actq':'act', 'lctq':'lct', 'cheq':'che', 'dlcq':'dlc', 'dlttq': 'dltt', 'ibq': 'ib'})
else:
# annual compustat
# rename columns for consistency
raw_df = raw_df.rename(columns = {'dp': 'dpre'})
# make date a datetime.date and align to the end of the year/quarter
raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
if(freq == 'A'):
raw_df['year_end'] = raw_df.datadate + YearEnd(0)
else:
raw_df['quarter_end'] = raw_df.datadate + QuarterEnd(0)
# I HATE U S&P
if('fyear' in query_vars):
raw_df.fyear = raw_df.fyear.astype(float)
# add years in to compustat
if('years_in' in query_vars):
raw_df['years_in'] = raw_df.groupby(by = ['gvkey']).cumcount()
# get vars in the dataframe
quried_vars = list(set(list(raw_df.columns)) - set(['date']))
vars_dtypes = {}
for var in quried_vars:
if(var in DEFAULT_DTYPES):
vars_dtypes[var] = DEFAULT_DTYPES[var]
# convert dtypes
raw_df = raw_df.fillna(value = np.nan)
raw_df = raw_df.astype(vars_dtypes)
# sort just for ease of reading
raw_df = raw_df.drop_duplicates()
sorting_dims = ['gvkey', 'year_end'] if(freq == 'A') else ['gvkey', 'quarter_end']
raw_df = raw_df.sort_values(by = sorting_dims)
raw_df = raw_df.reset_index(drop = True)
# return the dataframe
return(raw_df)
def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) -> pd.DataFrame:
    """
    Used to query the raw CRSP files. Additionally, derived variables can be created and
    stock splits can be adjusted for.
    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date: The ending date of the data query.
    freq: The frequency of the data query.
        Choices are:
            * M: monthly
            * D: daily
    adj_stocksplit: default = True; Whether or not to adjust prices and shares outstanding for stock split events.
    Keyword Arguments
    __________________
    vars: list; The variables to query for.
    add_vars: list; Additional variables to query for on top of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to true to query for all variables in the table.
    id_type: str; Type of ID used to query for specific assets.
        Choices are:
            * ticker
            * gvkey
            * permno
            * cusip
            * permco
    ids: list; The ids of type 'id_type' to query for.
    exchcds: list; The exchange codes to use for querying (default: NYSE, NYSE Market (formerly AMEX), and NASDAQ)
    shrcds: list; The share codes to use for querying (default: US Common Stock)
    Note
    _____
    The variables that are created from CRSP primitives are:
        * 'me': Market Equity (millions)
        * 'adjret': Returns adjusted for delisting events
        * 'adjretx': Returns adjusted for delisting events ex. dividend
        * 'dvd': Dividend (uses 'adjret' and 'adjretx' to calculate)
        * 'dp': Dividend-to-Price Ratio
        * 'cumret' / 'cumretx': Cumulative (ex. dividend) returns
        * 'adjcumret' / 'adjcumretx': Cumulative delisting-adjusted (ex. dividend) returns
    Note
    _____
    This function defaults to querying for all companies that are consistent with Fama's definitions,
    that is to say assets with a share code of 10 or 11 and an exchange code of 1, 2, or 3.
    """
    # variables stored in the underlying monthly/daily stock tables
    STD_VARS = None
    if(freq == 'M'):
        STD_VARS = ['date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno',
                    'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr',
                    'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd',
                    'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt']
    else:
        STD_VARS = ['date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno',
                    'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr',
                    'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd',
                    'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd']
    # variables derived from CRSP primitives rather than stored in the table
    CREATE_VARS = ['me', 'adjret', 'adjretx', 'dvd', 'dp', 'cumret', 'cumretx', 'adjcumret', 'adjcumretx']
    VALID_VARS = STD_VARS + CREATE_VARS
    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'me', 'dp', 'vol']
    # variable data types
    VARS_DATA_TYPE = {'cusip': str, 'permno': str, 'permco' : str, 'comnam': str, 'compno': str, 'ticker': str,
                      'primexch': str, 'tsymbol': str, 'secstat': str, 'hsiccd': str, 'naics': str, 'siccd': str, 'trdstat': str, 'ncusip': str,
                      'shrcd': str, 'exchcd': str, 'issuno': str, 'hexcd': str, 'shrcls': str,
                      'ret': float, 'retx': float, 'shrout': float, 'prc': float, 'cfacshr': float, 'cfacpr': float,
                      'bidlo': float, 'bid': float, 'ask': float, 'askhi': float, 'spread': float, 'altprc': float, 'vol': float,
                      'dlstdt': str, 'dlstcd': str, 'nwperm': str, 'nwcomp': str, 'nextdt': str, 'dlamt': float, 'dlretx': float, 'dlprc': float,
                      'dlpdt': str, 'dlret': float, 'acperm': str, 'accomp': str, 'me': float, 'adjret': float, 'adjretx': float, 'dvd': float,
                      'adjdvd': float, 'dp': float, 'openprc': float, 'numtrd': float, 'cumret': float, 'cumretx': float, 'adjcumret': float, 'adjcumretx': float}
    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################
    if(freq not in ['D', 'M']): raise Exception('Invalid frequency given to query_CRSP')
    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keyword Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')
    # create list of the variables being queried
    query_vars = None
    if('vars' in kwargs):
        # variable arguments to query for
        query_vars = kwargs['vars']
        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars): query_vars.insert(0, 'permno')
        # add date if people forgot
        if('date' not in query_vars): query_vars.insert(0, 'date')
    else:
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + kwargs['add_vars']
        else:
            query_vars = DEFAULT_VARS
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
    # presence of the keyword (not its value) selects all variables, mirroring the other query methods
    if('all_vars' in kwargs): query_vars = VALID_VARS
    # used for dataframe formatting at the end
    og_vars = query_vars.copy()
    # make sure that all vars are valid to be queried
    all_valid = all(elem in VALID_VARS for elem in query_vars)
    if(not all_valid):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from CRSP.')
    # always adjust for stock splits (can disable this)
    if(adj_stocksplit):
        if('prc' not in query_vars): query_vars.append('prc')
        if('cfacpr' not in query_vars): query_vars.append('cfacpr')
        if('shrout' not in query_vars): query_vars.append('shrout')
        if('cfacshr' not in query_vars): query_vars.append('cfacshr')
    # if created variables are being queried for then add the variables needed to create them
    if('me' in query_vars):
        if('prc' not in query_vars): query_vars.append('prc')
        if('shrout' not in query_vars): query_vars.append('shrout')
    if('dp' in query_vars):
        if('dvd' not in query_vars): query_vars.append('dvd')
    if('dvd' in query_vars):
        if('adjret' not in query_vars): query_vars.append('adjret')
        if('adjretx' not in query_vars): query_vars.append('adjretx')
        if('prc' not in query_vars): query_vars.append('prc')
    if('cumret' in query_vars):
        if('ret' not in query_vars): query_vars.append('ret')
    if('cumretx' in query_vars):
        if('retx' not in query_vars): query_vars.append('retx')
    # BUG FIX: the cumulative adjusted returns compound 'adjret'/'adjretx', so those
    # intermediate columns must also be requested or they would never get created below
    if('adjcumret' in query_vars):
        if('adjret' not in query_vars): query_vars.append('adjret')
    if('adjcumretx' in query_vars):
        if('adjretx' not in query_vars): query_vars.append('adjretx')
    if('adjret' in query_vars):
        if('ret' not in query_vars): query_vars.append('ret')
        if('dlret' not in query_vars): query_vars.append('dlret')
    if('adjretx' in query_vars):
        if('retx' not in query_vars): query_vars.append('retx')
        if('dlretx' not in query_vars): query_vars.append('dlretx')
    exchcds = kwargs['exchcds'] if('exchcds' in kwargs) else [1, 2, 3] # default: NYSE, NYSE MKT, NASDAQ
    shrcds = kwargs['shrcds'] if('shrcds' in kwargs) else [10, 11] # default: US-based common stock
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']
    # created vars are not in the table so remove them
    db_vars = [var for var in query_vars if var not in CREATE_VARS]
    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################
    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CRSP_SQL_query(start_date, end_date, freq, vars = db_vars, exchcds = exchcds, shrcds = shrcds,
                                              specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine)
    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################
    # nullable integer downcast so missing codes survive the later cast to string
    DOWNCAST_VARS = ['permno', 'permco', 'exchcd', 'issuno', 'hexcd', 'shrcd', 'compno', 'hsiccd', 'naics', 'siccd', 'acperm', 'accomp', 'dlstcd', 'nwcomp', 'nwperm']
    for var in DOWNCAST_VARS:
        if(var in query_vars):
            raw_df[var] = raw_df[var].astype('Int64')
    # get vars in the dataframe (date columns are converted separately below)
    queried_vars = list(set(list(raw_df.columns)) - set(['altprcdt', 'date', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']))
    vars_dtypes = {}
    for var in queried_vars:
        vars_dtypes[var] = VARS_DATA_TYPE[var]
    # convert dates to datetimes and align to end of month
    raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
    if(freq == 'M'): raw_df.date += MonthEnd(0)
    if('altprcdt' in query_vars): raw_df.altprcdt = pd.to_datetime(raw_df.altprcdt, format = '%Y-%m-%d')
    if('nameendt' in query_vars): raw_df.nameendt = pd.to_datetime(raw_df.nameendt, format = '%Y-%m-%d')
    if('namedt' in query_vars): raw_df.namedt = pd.to_datetime(raw_df.namedt, format = '%Y-%m-%d')
    if('dlstdt' in query_vars): raw_df.dlstdt = pd.to_datetime(raw_df.dlstdt, format = '%Y-%m-%d')
    if('nextdt' in query_vars): raw_df.nextdt = pd.to_datetime(raw_df.nextdt, format = '%Y-%m-%d')
    if('dlpdt' in query_vars): raw_df.dlpdt = pd.to_datetime(raw_df.dlpdt, format = '%Y-%m-%d')
    # make sure that the data is the correct type
    raw_df = raw_df.astype(vars_dtypes)
    # replace any python 'None' objects with np.nan
    raw_df = raw_df.fillna(value = np.nan)
    # adjust for stock splits
    if(adj_stocksplit):
        raw_df.prc /= raw_df.cfacpr
        raw_df.shrout *= raw_df.cfacshr
    # Market Equity. Market equity (size) is price times shares outstanding. Price and shares outstanding from CRSP.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    if('me' in query_vars):
        raw_df['me'] = raw_df.prc.abs() * raw_df.shrout
        # convert market equity to $millions
        raw_df.me /= 1e3
    # adjust for delisting return
    if('adjret' in query_vars):
        raw_df.dlret = raw_df.dlret.fillna(value = 0.0)
        raw_df['adjret'] = ((1 + raw_df.ret) * (1 + raw_df.dlret)) - 1
    if('adjretx' in query_vars):
        raw_df.dlretx = raw_df.dlretx.fillna(value = 0.0)
        raw_df['adjretx'] = ((1 + raw_df.retx) * (1 + raw_df.dlretx)) - 1
    # create dividends paid using 'adjret' and 'adjretx'
    if('adjret' in query_vars and 'adjretx' in query_vars):
        raw_df['dvd'] = (raw_df.adjret - raw_df.adjretx) * raw_df.groupby(['permco'])['prc'].shift(1).abs()
    # create cumulative returns
    if('cumret' in query_vars):
        raw_df['1+ret'] = 1 + raw_df.ret
        raw_df['cumret'] = raw_df.groupby(by = ['permno'])['1+ret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+ret'])
    if('cumretx' in query_vars):
        # BUG FIX: previously compounded 'ret' instead of 'retx'
        raw_df['1+retx'] = 1 + raw_df.retx
        raw_df['cumretx'] = raw_df.groupby(by = ['permno'])['1+retx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+retx'])
    if('adjcumret' in query_vars):
        # BUG FIX: previously compounded the unadjusted return 'ret'
        raw_df['1+adjret'] = 1 + raw_df.adjret
        raw_df['adjcumret'] = raw_df.groupby(by = ['permno'])['1+adjret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjret'])
    if('adjcumretx' in query_vars):
        # BUG FIX: condition previously tested 'adjcumret' a second time and compounded 'ret'
        raw_df['1+adjretx'] = 1 + raw_df.adjretx
        raw_df['adjcumretx'] = raw_df.groupby(by = ['permno'])['1+adjretx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjretx'])
    # Dividend Yield. The dividend yield used to form portfolios in June of year t is the total dividends paid from July of t-1
    # to June of t per dollar of equity in June of t. The dividend yield is computed using the with and without dividend returns
    # from CRSP, as described in Fama and French, 1988, "Dividend yields and expected stock returns," Journal of Financial Economics 25.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    # NOTE: Following Fama French the dividend price ratio uses the last year of dividends paid out if possible with a minimum 7 months.
    if('dp' in query_vars):
        if(freq == 'M'):
            min_periods = 7
            window = 12
        else:
            min_periods = 147 # 252 days / 12 months * 7 months
            window = 252
        raw_df['cumdvd'] = raw_df.groupby(['permno'])['dvd'].rolling(min_periods = min_periods, window = window).sum().reset_index(level = 'permno')[['dvd']]
        raw_df['dp'] = raw_df.cumdvd / raw_df.prc.abs()
        raw_df.dp = np.where((raw_df.dp.isnull()) | (raw_df.dp < 0), np.nan, raw_df.dp)
        raw_df = raw_df.drop(columns = ['cumdvd'])
    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[og_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)
    # return the cleaned dataframe
    return(raw_df)
def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> pd.DataFrame:
    """
    Query the risk-free rate from the Fama-French library in the local WRDS database.
    This rate is equivalent to the 1 month T-Bill rate.
    Parameters
    ___________
    start_date: datetime.date\n
        Starting date of the dataset being queried.
    end_date: datetime.date\n
        Ending date of the dataset being queried.
    obs_freq: str\n
        The observational frequency of the CRSP database being queried.
        Choices are:
            * 'D' : daily
            * 'M' : monthly
            * 'A' : annually
    Returns
    ________
    rf_df: pd.DataFrame\n
        Risk-free rate data with columns 'date' and 'rf'.
    Note
    _____
    The dataframe returned makes adjustments for NYSE holidays during compounding.
    """
    # Monthly observations are dated on the 1st of each month, so a 'start_date'
    # that falls mid-month is snapped back to the start of its month so the query
    # still picks up that month's observation.
    if(obs_freq in ['M', 'A']):
        month_start = (start_date + MonthBegin(-1)).date()
        if(start_date != month_start):
            start_date = month_start
    # pull the risk-free series from the local database
    rf_df = pd.read_sql(self._rf1m_SQL_query(start_date, end_date, obs_freq), con = self.sql_engine)
    rf_df['date'] = pd.to_datetime(rf_df['date'])
    # move observation dates to the end of the period for non-daily frequencies
    if(obs_freq == 'M'):
        rf_df['date'] = rf_df['date'] + MonthEnd(0)
    elif(obs_freq == 'A'):
        rf_df['date'] = rf_df['date'] + YearEnd(0)
    return(rf_df)
def query_link_table(self) -> pd.DataFrame:
    """
    Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities
    to Compustat companies on permno and gvkey.
    Returns
    ________
    link_df: pd.DataFrame\n
        The raw linking table pulled from the local WRDS database.
    Note
    _____
    Currently this function only works if a local copy of the WRDS database exists w/ the CCM Linktable.
    """
    # keep only primary links ('C' or 'P') whose linktype starts with 'L'
    sql_str = """
        SELECT gvkey, lpermno as permno, lpermco as permco, linktype, linkprim, linkdt, linkenddt
        FROM CRSP_CCMXPF_LINKTABLE
        WHERE substr(linktype, 1, 1) = 'L'
        AND (linkprim = 'C' or linkprim = 'P')
        """
    # read in raw dataframe from the local database
    link_df = pd.read_sql(sql_str, con = self.sql_engine)
    # downcast permno/permco to integers, then store all identifiers as strings
    for id_col in ('permco', 'permno'):
        link_df[id_col] = pd.to_numeric(link_df[id_col], downcast = 'integer')
    for id_col in ('gvkey', 'permno', 'permco'):
        link_df[id_col] = link_df[id_col].astype(str)
    # a missing link end date means the link is still active, so use today's date
    link_df.linkenddt = link_df.linkenddt.fillna(pd.to_datetime('today').date())
    # convert the link window bounds to datetimes
    link_df.linkdt = link_df.linkdt.astype('datetime64[ns]')
    link_df.linkenddt = link_df.linkenddt.astype('datetime64[ns]')
    return(link_df)
# ----------------------------------------------------------------------------------------------------------------------------
# INTERNAL METHODS (class <QueryWRDS>)
#
# These are internal methods and should only be called within this class. Functionality and accuracy of these methods cannot
# garunteed if they are called outside of this class.
# ----------------------------------------------------------------------------------------------------------------------------
# INTERNAL METHOD
def _list_to_sql_str(self, lst: list, table: str = None) -> str:
res = ''
for var in lst:
if(table is None):
res += f'\'{var}\', '
else:
res += f'{table}.{var}, '
res = res[:-2]
return(res)
# INTERNAL METHOD
def _CCM_sql_query(self, start_date: datetime.date, end_date: datetime.date, vars: list, specific_query: bool, id_type: str, ids: list):
sql_str = ''
table = 'CCM'
# convert date time object to strings for the SQL query
start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
end_date_str = '\'' + end_date.strftime('%Y-%m-%d') + '\''
# create argument string
sql_str += f'SELECT {self._list_to_sql_str(vars, table)} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'
# additional subsetting
if(specific_query): sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'
return(sql_str)
# INTERNAL METHOD
def _compustat_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, indfmt: list, datafmt: list, popsrc: list, consol: list, specific_query: bool, id_type: str, ids: list) -> str:
"""
INTERNAL METHOD: Create SQL string used to query the WRDS Compustat database.
Parameters
___________
start_date: Starting date for the dataset queried.
end_date: Ending date for the dataset queried.
freq: The observational frequency of the query.
Choices are:
* 'Q' : quarterly
* 'A' : annual
Returns
________
sql_str: String containing the SQL code used to query the specified Compustat database beased on
the start and end date and frequency given.
"""
sql_str = ''
table = f'COMPA_FUND{freq}'
# convert date time object to strings for the SQL query
start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
end_date_str = '\'' + end_date.strftime('%Y-%m-%d') + '\''
# create argument string
var_str = None
if(vars is None):
var_str = '*'
else:
var_str = self._list_to_sql_str(vars, table)
sql_str += f'SELECT {var_str} FROM {table} WHERE datadate BETWEEN {start_date_str} AND {end_date_str}'
# additional subsetting
if(len(indfmt) != 0): sql_str += f' AND COMPA_FUND{freq}.indfmt IN ({self._list_to_sql_str(indfmt)})'
if(len(datafmt) != 0): sql_str += f' AND COMPA_FUND{freq}.datafmt IN ({self._list_to_sql_str(datafmt)})'
if(len(popsrc) != 0): sql_str += f' AND COMPA_FUND{freq}.popsrc IN ({self._list_to_sql_str(popsrc)})'
if(len(consol) != 0): sql_str += f' AND COMPA_FUND{freq}.consol IN ({self._list_to_sql_str(consol)})'
if(specific_query): sql_str += f' AND COMPA_FUND{freq}.{id_type} IN ({self._list_to_sql_str(ids)})'
return(sql_str)
# INTERNAL METHOD
def _CRSP_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, exchcds: list, shrcds: list, specific_query: bool, id_type: str, ids: list) -> str:
"""
INTERNAL METHOD: Create SQL string used to query the local WRDS CRSP monthly database.
Parameters
___________
start_date: Starting date for the dataset queried.
end_date: Ending date for the dataset queried.
freq: Observational frequency.
Choices are:
* 'D' : daily
* 'M' : monthly
Returns
________
sql_str : str\n
String containing the SQL code used to query the specified CRSP database beased on
the observational frequency and WRDS update frequency of the CRSP database.
Note
_____
Additonal to pulling the daily stock file (dsf) or the monthly stock file (msf) we also pull
the daily or monthly stock events names file for the exchange code and the share code.
"""
# table to query from
sql_str = ''
table = f'CRSP_{freq}'
# convert date time object to strings for the SQL query
start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
end_date_str = '\'' + end_date.strftime('%Y-%m-%d') + '\''
# create argument string
var_str = self._list_to_sql_str(vars, table)
sql_str += f'SELECT {var_str} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'
# additional subsetting
if(len(exchcds) != 0): sql_str += f' AND exchcd in ({self._list_to_sql_str(exchcds)})'
if(len(shrcds) != 0): sql_str += f' AND shrcd in ({self._list_to_sql_str(shrcds)})'
if(specific_query): sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'
return(sql_str)
# INTERNAL METHOD
def _rf1m_SQL_query(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> str:
"""
INTERNAL METHOD: Create SQL string used to query the Fama-French risk free rate
listed on WRDS CRSP in the FF library. This rate is the
1 month T-Bill rate.
Parameters
___________
start_date: str\n
Starting date for the data being queried.
end_date: str\n
Ending date for the data being queried.
obs_freq: str\n
The observational frequency of the CRSP delisting database being queried.
Choices are:
* 'D' : daily
* 'M' : monthly
* 'A' : annual
Returns
________
sql_str : str\n
String containing the SQL code used to query the risk free rate in the
Fama-French (FF) library on CRSP/WRDS database.
Note
_____
Depending on the observational frequency (obs_freq) given the compounding of the
risk-free rate changes.
"""
# convert date time object to strings for the SQL query
start_date_str = start_date.strftime('%Y-%m-%d')
end_date_str = end_date.strftime('%Y-%m-%d')
# Depending on the frequency supplied the compounding changes
if(obs_freq == 'D'):
sql_1 = 'strftime(\'%d\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%d\', date) AS diff'
sql_2 = 'rf AS cumrf'
library = 'FF_FACTORS_DAILY'
elif(obs_freq == 'M'):
sql_1 = 'strftime(\'%m\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%m\', date) AS diff'
sql_2 = 'rf AS cumrf'
library = 'FF_FACTORS_MONTHLY'
elif(obs_freq == 'A'):
sql_1 = 'strftime(\'%Y\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%Y\', date) AS diff'
sql_2 = 'EXP(SUM(LN(1 + rf)) OVER (PARTITION BY strftime(\'%Y\', date))) - 1 AS cumrf'
library = 'FF_FACTORS_MONTHLY'
else:
cprint.fatal('No valid observational frequency given.', interrupt = True)
sql_dic = {'sql_1' : sql_1, 'sql_2' : sql_2, 'library' : library, 'start_date' : '\'' + start_date_str + '\'', 'end_date' : '\'' + end_date_str + '\''}
sql_str = """
SELECT date, rf
FROM (
SELECT date, {0}, rf, {1}
FROM {2}
WHERE date BETWEEN {3} AND {4}
) AS crsp_rf
WHERE diff != 0 OR diff IS NULL
""".format(sql_dic['sql_1'], sql_dic['sql_2'], sql_dic['library'], sql_dic['start_date'], sql_dic['end_date'])
return(sql_str)
Classes
class QueryWRDS (WRDS_username: str, local_db_path: pathlib.Path = None, update_all_tables: bool = False, tables_to_update: list = [], update_tolerance: int = 3)
-
Initializes the QueryWRDS class. During this process all of the files needed from WRDS are downloaded to create a local SQL database. Additionally, combined files are created. The first files that are created are CRSP_M (monthly) and CRSP_D (daily). These tables contain all of the variables in the stock file, names file, and delisting file. Additionally, a CCM file is created; this file contains the most commonly used CRSP variables and the Compustat variables used to make the standard Fama-French anomaly characteristics, and the standard anomaly characteristics are created.
Parameters
WRDS_username: Personal WRDS username.
local_db_path: default = None; Location to create and read from the local SQL database.
update_all_tables: default = False; If true the local database is deleted and recreated by downloading from WRDS.
tables_to_update: default = []; List of tables to update from WRDS.
update_tolerance: default = 3; Number of quarters the tables can be out of date.
Note
If no 'local_db_path' is given then the WRDS database is created in the current directory
Note
The table names in the local SQL database mirror those found on WRDS but with underscores replacing periods. Thus, when updating tables you need to use the local names of the tables (i.e. CRSP_MSF instead of CRSP.MSF).
Note
Anomaly characteristics created: * bm: Book-to-Market * pe: Price-to-Earnings * cfp: Cashflow-to-Price * inv: Investment * op: Operating Profitability
Expand source code
class QueryWRDS: # TODO: # (1) add more detail to string repr # (2) setup update reminder if information is over a year out of date # (3) blocking so it will work on other computers def __init__(self, WRDS_username: str, local_db_path: pathlib.Path = None, update_all_tables: bool = False, tables_to_update: list = [], update_tolerance: int = 3) -> None: """ Initalizers the QueryWRDS class. During this process all of the files need from WRDS are downloaded to create a local SQL database. Additionally, combined files are created. The first files that are created are CRSP_M (monthly) and CRSP_D (daily). These tables contain all of the variables in the stock file, names file, and delisting file. Additionally, CCM file is created; this file contain the most commonly used CRSP varaiables and the Compustat variables used to make the standard Fama-French anomaly characteristics, the standard anomaly characteristics are created Parameters ___________ WRDS_username: Personal WRDS username. local_db_path: default = None; Location to create and read from the local SQL database. update_all_tables: default = False; If true the local database is deleted and recreated by downloading form WRDS. tables_to_update: default = []; List of tables to update from WRDS. update_tolerance: default = 3; Number of quarters the tables can be out of date. Note _____ If no 'local_db_path' is given then the WRDS database is created in the current directory Note _____ The table names in the local SQL database mirror those found on WRDS but with underscores replcaing periods. Thus, when updating tables need to use the local names of the tables (i.e.) 
CRSP_MSF instead of CRSP.MSF Note _____ Anomally characterisitcs created: * bm: Book-to-Market * pe: Price-to-Earnings * cfp: Cashflow-to-Price * inv: Investment * op: Operating Profitablility """ # list of tables from WRDS to make local WRDS_TABLES = ['FF.FACTORS_DAILY', # FamaFrench Factors daily (used for risk-free rate) 'FF.FACTORS_MONTHLY', # FamaFrench Factors monthly (used for risk-free rate) 'CRSP.CCMXPF_LINKTABLE', # CCM link table used to merge CRSP and Compustat 'CRSP.MSEDELIST', # CRSP monthly delist events 'CRSP.MSF', # CRSP monthly stock file 'CRSP.MSENAMES', # CRSP monthly event file 'COMPA.FUNDA', # Compustat annual observations 'COMPA.FUNDQ'] # Compustat quarterly observations # list of created tables CREATED_TABLES = ['CRSP_M', # CRSP monthly file (i.e. merged CRSP.MSF, CRSP.MSENAMES, CRSP.MSEDELIST) 'CCM'] # CRSP/Compustat merged file # set class attributes self.WRDS_username = WRDS_username if(local_db_path is None): # set location to be in the current directory self.local_db_path = pathlib.Path('WRDS.db') else: # user specified loaction self.local_db_path = local_db_path self.update_all_tables = update_all_tables self.tables_to_update = tables_to_update self.update_tolerance = update_tolerance self.today_date = datetime.date.today().strftime('%Y-%m-%d') # used to update all tables if(self.update_all_tables and self.local_db_path.exists()): cprint.warn('Updating the tables in the local database. 
This process could take a long time...') os.remove(self.local_db_path) # create sql engine self.sql_engine = sqlalchemy.create_engine('sqlite:///' + str(self.local_db_path)) # list of current tables # check to see if all required tables are present, if not load the ones that are missing inspect = sqlalchemy.inspect(self.sql_engine) self.curr_tables = inspect.get_table_names() # delete tables that should be updated for table_name in self.tables_to_update: if(table_name in self.curr_tables): with self.sql_engine.connect() as conn: if(table_name not in CREATED_TABLES): _ = conn.execute(f"""DROP TABLE {table_name}""") # drop combo files and remake for cr_table in CREATED_TABLES: _ = conn.execute(f'DROP TABLE {cr_table}') else: # only drop the created table and remake _ = conn.execute(f'DROP TABLE {table_name}') table_names = [name.replace('.', '_') for name in WRDS_TABLES] # local table names # check CSV directory for files to include CSV_directory = self.local_db_path.parent / 'CSVtoSQL' for csvfile in os.listdir(CSV_directory): f = os.path.join(CSV_directory, csvfile) if(os.path.isfile(f)): filepath = pathlib.Path(f) tablename = filepath.name.strip('.csv') if(tablename in self.curr_tables): continue s = time.time() cprint.info(f'Adding {filepath.name} to SQL database {self.local_db_path.name}...') subprocess.call(['sqlite3', f'{self.local_db_path}', '.mode csv', f'.import {filepath} {tablename}', '.mode columns']) e = time.time() cprint.info(f'Finished {filepath.name}: {round(e - s, 3)}s') # update current tables self.curr_tables = inspect.get_table_names() # read in the data from WRDS if(not all(elem in self.curr_tables for elem in table_names)): missing_tables = list(set(table_names) - set(inspect.get_table_names())) cprint.warn(f'The following tables are missing from the local database: {missing_tables}. 
Querying WRDS to add them to the local database.') cprint.info('Connecting to WRDS...') self.WRDS_db = wrds.Connection(username = self.WRDS_username) for table_name in missing_tables: table = table_name.replace('_', '.', 1) print('-------------------------------------') cprint.info(f'Starting {table}') s = time.time() sql_str = '''SELECT * FROM ''' + table # download the data to a dataframe df = self.WRDS_db.raw_sql(sql_str) cprint.ok(f'Dataframe in memory: {asizeof.asizeof(df) / (10 ** 9)}GB') # add end of month column for CRSP_MSEDELIST if(table_name == 'CRSP_MSEDELIST'): df['date'] = df.dlstdt + MonthEnd(0) if(table_name == 'CRSP_DSEDELIST'): df['date'] = df.dlstdt # create date column for merging # write the dataframe to the local sql database df.to_sql(table_name, con = self.sql_engine, if_exists = 'replace', index = False) del df e = time.time() cprint.info(f'Finished {table}: {round(e - s, 3)}s') print('-------------------------------------\n') cprint.info('Raw WRDS files have been added to the local databse.') if('CRSP_M' not in self.curr_tables): cprint.info(f'Creating combined data table CRSP_M...') sf_df = pd.read_sql(f"""SELECT * FROM CRSP_MSF LIMIT 1""", con = self.sql_engine) names_df = pd.read_sql(f"""SELECT * FROM CRSP_MSENAMES LIMIT 1""", con = self.sql_engine) delsit_df = pd.read_sql(f"""SELECT * FROM CRSP_MSEDELIST LIMIT 1""", con = self.sql_engine) vars_to_select = '' for var in list(sf_df.columns): vars_to_select += f'CRSP_MSF.{var}, ' for var in list(set(list(names_df.columns)) - set(list(sf_df.columns))): vars_to_select += f'CRSP_MSENAMES.{var}, ' for var in list(set(list(delsit_df.columns)) - set(list(sf_df.columns)) - set(list(names_df.columns))): vars_to_select += f'CRSP_MSEDELIST.{var}, ' vars_to_select = vars_to_select[:-2] sql_dic = {'vars': vars_to_select} sql_str = '''CREATE TABLE CRSP_M AS SELECT {0} FROM CRSP_MSF LEFT JOIN CRSP_MSENAMES ON CRSP_MSF.permno = CRSP_MSENAMES.permno AND CRSP_MSENAMES.namedt <= CRSP_MSF.date AND 
CRSP_MSF.date <= CRSP_MSENAMES.nameendt LEFT JOIN CRSP_MSEDELIST ON CRSP_MSF.permno = CRSP_MSEDELIST.permno AND CRSP_MSF.date = CRSP_MSEDELIST.date'''.format(sql_dic['vars']) with self.sql_engine.connect() as conn: _ = conn.execute(sql_str) cprint.info('Combined CRSP tables have been created.') # create merged CRSP and Compustat table if('CCM' not in self.curr_tables): table = 'CCM' cprint.info(f'Creating combined CRSP and Compustat table: {table}') start_date = datetime.date(1900, 6, 30) end_date = datetime.date(2100, 6, 30) # Compustat ------------------------------------------------------------------------------------------- COMP_df = self.query_Compustat(start_date, end_date, 'A', sub_vars = ['ticker'], add_vars = ['years_in', 'fyear', 'revt', 'adjex_f']) COMP_df['year'] = COMP_df.datadate.dt.year # create preferrerd stock COMP_df['ps'] = np.where(COMP_df.pstkrv.isnull(), COMP_df.pstkl, COMP_df.pstkrv) COMP_df.ps = np.where(COMP_df.ps.isnull(), COMP_df.pstk, COMP_df.ps) COMP_df.ps = np.where(COMP_df.ps.isnull(), 0, COMP_df.ps) COMP_df.txditc = COMP_df.txditc.fillna(0) # create book equity COMP_df['be'] = np.where(COMP_df.fyear < 1993, COMP_df.seq + COMP_df.txditc - COMP_df.ps, COMP_df.seq - COMP_df.ps) # earnings COMP_df['earn'] = np.where(~COMP_df.ib.isnull(), COMP_df.ib, np.nan) # operating profitability COMP_df['xp_allnan'] = (COMP_df.cogs.isnull()) & (COMP_df.xsga.isnull()) & (COMP_df.xint.isnull()) COMP_df['profit'] = COMP_df.revt - COMP_df.cogs.fillna(0) - COMP_df.xint.fillna(0) - COMP_df.xsga.fillna(0) COMP_df['op'] = np.where(COMP_df.be + COMP_df.mib != 0, COMP_df.profit / (COMP_df.be + COMP_df.mib.fillna(0)), np.nan) COMP_df.op = np.where(((~COMP_df.op.isnull()) & (~COMP_df.revt.isnull()) & (~COMP_df.xp_allnan)), COMP_df.op, np.nan) # NOTE: Compustat data yields gross outliers in 'op' w/ ratios as large as '1,000'. 
# To be consistent w/ summary statistics for characteristics provided by Ken French's online library, # values for 'op' outside the 99th percentile are set to missing. COMP_df.op = np.where((COMP_df.op <= COMP_df.op.quantile(0.99)), COMP_df.op, np.nan) # investment try: COMP_df['inv'] = np.log(COMP_df['at']) - np.log(COMP_df.groupby(by = ['gvkey'])['at'].shift(1)) except FloatingPointError: COMP_df['inv'] = (COMP_df['at'] / COMP_df.groupby(by = ['gvkey'])['at'].shift(1)) - 1 COMP_df.inv = np.where(~COMP_df.inv.isnull(), COMP_df.inv, np.nan) # NOTE: Compustat data yields gross outliers in 'inv' w/ percentages as low as '-100%' and as large as '10,000%'. # These outliers are pervasive on the left tail of the distribution. # To be consistent w/ summary statistics for characteristics provided by Ken French's online library, # values for 'inv' outside [15th, 99th] percentiles are winsorized. COMP_df.inv = np.where((COMP_df.inv.quantile(0.15) <= COMP_df.inv), COMP_df.inv, COMP_df.inv.quantile(0.15)) COMP_df.inv = np.where((COMP_df.inv <= COMP_df.inv.quantile(0.99)), COMP_df.inv, COMP_df.inv.quantile(0.99)) # cash flow COMP_df['cf'] = COMP_df.ib + COMP_df.txdi.fillna(0) + COMP_df.dpre.fillna(0) COMP_df.cf = np.where(~COMP_df.cf.isnull(), COMP_df.cf, np.nan) # accruals COMP_df['csho_adj'] = np.where((COMP_df.csho * COMP_df.adjex_f > 0), COMP_df.csho * COMP_df.adjex_f, np.nan) COMP_df['owcap_adj'] = ((COMP_df.act - COMP_df.che) - (COMP_df.lct.fillna(0) - COMP_df.dlc.fillna(0))) / COMP_df.csho_adj COMP_df['d_owcap_adj'] = COMP_df.owcap_adj - COMP_df.groupby(by = ['gvkey'])['owcap_adj'].shift(1) COMP_df['ac'] = np.where(~COMP_df.csho_adj.isnull(), COMP_df.d_owcap_adj / (COMP_df.be / COMP_df.csho_adj), np.nan) # NOTE: Compustat data yields gross outliers in 'ac' for June of each year {t} w/ ratios as low as '-200' and as large as '200'. 
# To be consistent w/ summary statistics for characteristics provided by Ken French's online library, # values for 'ac' less than '-200' and values for 'ac' larger than '200' are set to missing. COMP_df.ac = np.where(-200 <= COMP_df.ac, COMP_df.ac, np.nan) COMP_df.ac = np.where(COMP_df.ac <= 200, COMP_df.ac, np.nan) # net shares issused COMP_df['ni_csho_adj'] = np.where(COMP_df.csho * COMP_df.adjex_f > 0, COMP_df.csho * COMP_df.adjex_f, np.nan) try: COMP_df['nsi'] = np.log(COMP_df.ni_csho_adj) - np.log(COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1)) except FloatingPointError: COMP_df['nsi'] = (COMP_df.ni_csho_adj / COMP_df.groupby(by = ['gvkey'])['ni_csho_adj'].shift(1)) - 1 COMP_df.nsi = np.where(~COMP_df.nsi.isnull(), COMP_df.nsi, np.nan) # NOTE: Compustat data yields outliers in 'ni' w/ ratios as large as '20'. # To be consistent w/ summary statistics for characteristics provided by Ken French's online library, # values for 'ni' outside the 99.9th percentile are set to missing. COMP_df.nsi = np.where(COMP_df.nsi <= COMP_df.nsi.quantile(0.999), COMP_df.nsi, np.nan) COMP_df = COMP_df.drop(columns = ['owcap_adj', 'xp_allnan']) # CRSP ------------------------------------------------------------------------------------------------ CRSP_df = self.query_CRSP(start_date, end_date, 'M') CRSP_df['jdate'] = CRSP_df.date # create timing columns CRSP_df['year'] = CRSP_df['jdate'].dt.year CRSP_df['month'] = CRSP_df['jdate'].dt.month # turnover (turn) # The turnover (TURN) for each stock is defined the monthly traded volume scaled by the total number of shares outstanding. CRSP_df['turn'] = CRSP_df.vol / CRSP_df.shrout # traded volume in dollars (dvol) # The traded volume in dollars (DVOL) is defined as the number of shares traded in a given month multiplied by the closing stock price. 
CRSP_df['dvol'] = CRSP_df.vol * CRSP_df.prc.abs() # calculate prior returns # Prior 1-1 is the cummulative return in [t - 1] # Prior 2-12 is the cummulative return from [t - 12] to [t - 2] # Prior 13-60 is the cummulative return from [t - 60] to [t - 13] for ret_typ in ['adjret', 'adjretx']: for typ in [(1, 1), (2, 12), (13, 60)]: name = f'pr{typ[0]}_{typ[1]}' if(ret_typ == 'adjret') else f'prx{typ[0]}_{typ[1]}' CRSP_df[name] = 1 for i in range(typ[0], typ[1] + 1): CRSP_df[f'{ret_typ}_L{i}'] = 1 + CRSP_df.groupby(by = ['permno'])[ret_typ].shift(i) CRSP_df[name] *= CRSP_df[f'{ret_typ}_L{i}'] CRSP_df = CRSP_df.drop(CRSP_df.filter(regex = '_L').columns, axis = 1) CRSP_df[name] -= 1 ### Aggregate Market Cap ### # sum of me across different permno belonging to same permco a given date crsp_summe = CRSP_df.groupby(['jdate','permco'])['me'].sum().reset_index() # largest mktcap within a permco/date crsp_maxme = CRSP_df.groupby(['jdate','permco'])['me'].max().reset_index() # join by jdate/maxme to find the permno CRSP_df = CRSP_df.merge(crsp_maxme, how='inner', on=['jdate','permco','me']) # drop me column and replace with the sum me CRSP_df = CRSP_df.drop(columns = ['me']) # join with sum of me to get the correct market cap info CRSP_df = CRSP_df.merge(crsp_summe, how='inner', on=['jdate','permco']) ### July to June dates CRSP_df['ffdate'] = CRSP_df['jdate'] + MonthEnd(-6) CRSP_df['ffyear'] = CRSP_df['ffdate'].dt.year CRSP_df['ffmonth'] = CRSP_df['ffdate'].dt.month CRSP_df['1+adjretx'] = 1 + CRSP_df.adjretx # cumret by stock and fama-french year CRSP_df['ffyear_cumretx'] = CRSP_df.groupby(by = ['permno', 'ffyear'])['1+adjretx'].cumprod() # lag of ffyear_cumretx CRSP_df['L_ffyear_cumretx'] = CRSP_df.groupby(by = ['permno'])['ffyear_cumretx'].shift(1) # lag market cap CRSP_df['L_me']=CRSP_df.groupby(by = ['permno'])['me'].shift(1) # if first permno then use me/(1+retx) to replace the missing value CRSP_df['months_in'] = CRSP_df.groupby(by = ['permno']).cumcount() CRSP_df.L_me 
= np.where(CRSP_df.months_in == 0, CRSP_df.me / CRSP_df['1+adjretx'], CRSP_df.L_me) # baseline me mebase = CRSP_df[CRSP_df.ffmonth == 1][['permno','ffyear', 'L_me']].rename(columns={'L_me': 'mebase'}) # merge result back together CRSP_df = CRSP_df.merge(mebase, how = 'left', on = ['permno', 'ffyear']) CRSP_df['wt'] = np.where(CRSP_df.ffmonth == 1, CRSP_df.L_me, CRSP_df.mebase * CRSP_df.L_ffyear_cumretx) # Info as of June & December and merge together for characteristics CRSP_JUN_df = CRSP_df[CRSP_df.month == 6] CRSP_DEC_df = CRSP_df[CRSP_df.month == 12] CRSP_DEC_df = CRSP_DEC_df[['permno','date','jdate','me','year']].rename(columns={'me': 'dec_me'}) CRSP_DEC_df.year += 1 CRSP_DEC_df = CRSP_DEC_df[['permno','year','dec_me']] CRSP_JUN_df = CRSP_JUN_df.merge(CRSP_DEC_df, how = 'inner', on = ['permno', 'year']) CRSP_JUN_df = CRSP_JUN_df[['permno', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'mebase', 'L_me', 'dec_me']] CRSP_JUN_df = CRSP_JUN_df.drop_duplicates() # query the link table link_table_df = self.query_link_table() CCM_df = COMP_df.merge(link_table_df, how='left', on=['gvkey']) CCM_df['jdate'] = CCM_df.year_end + MonthEnd(6) # set link date bounds CCM_df = CCM_df[(CCM_df.jdate >= CCM_df.linkdt) & (CCM_df.jdate <= CCM_df.linkenddt)] # link COMP_df and crsp CCM_JUN_df = CRSP_JUN_df.merge(CCM_df, how = 'inner', on = ['permno', 'jdate']) CCM_JUN_df['ffyear'] = CCM_JUN_df.jdate.dt.year # drop columns before merging CCM_JUN_df = CCM_JUN_df.drop(columns = ['mebase', 'L_me', 'linktype', 'linkprim', 'linkenddt', 'linkdt', 'date', 'jdate', 'shrcd', 'exchcd', 'adjret', 'me', 'wt', 'year', 'permco']) CRSP_df = CRSP_df.drop(columns = ['L_me', 'L_ffyear_cumretx', 'ffyear_cumretx', '1+adjretx', 'mebase', 'jdate']) # merge to monthly CCM_df = CRSP_df.merge(CCM_JUN_df, how = 'left', on = ['permno', 'ffyear']) CCM_df = CCM_df.dropna(subset = ['datadate']) CCM_df = CCM_df.sort_values(by = ['permno', 'date']) # create characterisitcs # book to market CCM_df['ffbm'] 
= np.where(CCM_df.dec_me != 0, CCM_df.be / CCM_df.dec_me, np.nan) CCM_df['bm'] = np.where(CCM_df.me != 0, CCM_df.be / CCM_df.me, np.nan) # earnings to price CCM_df['ffep'] = np.where(CCM_df.dec_me != 0, CCM_df.earn / CCM_df.dec_me, np.nan) CCM_df['ep'] = np.where(CCM_df.me != 0, CCM_df.earn / CCM_df.me, np.nan) # cashflow to price CCM_df['ffcfp'] = np.where(CCM_df.dec_me != 0, CCM_df.cf / CCM_df.dec_me, np.nan) CCM_df['cfp'] = np.where(CCM_df.me != 0, CCM_df.cf / CCM_df.me, np.nan) # market beta mkt_s = CCM_df.groupby('date').apply(wavg, 'adjret', 'me') mkt_s.name = 'mkt' mkt_s = mkt_s.to_frame().reset_index() CCM_df = CCM_df.merge(mkt_s, how = 'left', on = ['date']) CCM_df['cov'] = CCM_df.groupby(['permno'])[['adjret', 'mkt']].rolling(min_periods = 24, window = 60).cov().unstack()['adjret']['mkt'].reset_index(level = 'permno').rename(columns = {'mkt': 'cov'})[['cov']] CCM_df['var'] = CCM_df.groupby(['permno'])['mkt'].rolling(min_periods = 24, window = 60).var().reset_index(level = 'permno').rename(columns = {'mkt': 'var'})[['var']] CCM_df['beta'] = CCM_df['cov'] / CCM_df['var'] ## additional characteristics # sale-to-price CCM_df['sp'] = np.where(CCM_df.me != 0, CCM_df.sale / CCM_df.me, np.nan) #### ISSUES WITH GROWTH VARS # earnings growth CCM_df['eg'] = CCM_df.earn.pct_change() # grwoth in sales CCM_df['grs'] = CCM_df.sale.pct_change() # growth in book value CCM_df['grb'] = CCM_df.be.pct_change() # growth in cashflow CCM_df['grcf'] = CCM_df.cf.pct_change() def _percentile_rank(df, var): ptiles = list(df[var].quantile(q = list(np.arange(start = 0, step = 0.01, stop = 1)))) df[f'{var}_pr'] = 100 for i in range(99, 0, -1): mask = df[var] < ptiles[i] df.loc[mask, f'{var}_pr'] = i return(df) for var in ['bm', 'sp', 'cfp', 'dp', 'eg', 'grs', 'grcf', 'grb']: CCM_df = _percentile_rank(CCM_df, var) s = time.time() # Pastor-Stambaugh Liquidty Index (psliq) # The Pastor-Stambaugh Liquidity Index (PSLIQ) is defined for each stock as follow. 
We run one regression for each calendar month based on each stock’s daily return, using the current daily # return in the left-hand side. The right-hand side variables are the lagged daily return as well as the lagged daily return interacted with the lagged traded dollar volume. The coefficient of the # interaction term is the measure of liquidity – for each stock and each month. # e = time.time() print(CCM_df.head(50)) print(f'time = {e - s}') # save merged dataframe to SQL database CCM_df = CCM_df.drop_duplicates() CCM_df.to_sql(table, con = self.sql_engine, if_exists = 'replace', index = False) cprint.ok('CRSP and Compustat merged table has been created.') def __str__(self) -> str: #TODO: print tables and columns from loacl database return(f"WRDS Username: {self.username}") def raw_sql(self, sql_str): """ Allows the user to use raw SQL on the underlying database. Note _____ This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database. """ cprint.warn('The operation that you are about to perform might damage the local database. Do you wish to continue [y/n]:') response = input() if(response == 'y'): raw_df = pd.read_sql(sql_str, con = self.sql_engine) return(raw_df) else: cprint.info('Operation cancelled.') return(None) def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pd.DataFrame: """ Used to query the merged CRSP/Compustat (CCM) table. Parameters ___________ start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * Q: quarterly * A: annual Keyword Arguments __________________ vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. 
id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for. Note _____ The variables that can be queiried for are: 'gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 'pstk', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'shrcd', 'exchcd', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev', 'pr2_12', 'pr1_1', 'pr13_60', 'prx2_12', 'prx1_1', 'prx13_60' Note _____ If the frequency is quarterly the variables that can be queiried for are the same as the annual file except for 'pstkrv' and 'pstkl'. Note _____ The defualt variables that are queried for are: 'date', 'gvkey', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'op', 'PR2_12', 'PR1_1', 'PR13_60' """ # vars that can be queiried for VALID_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret', 'adjcumretx', 'dp', 'year', 'month', 'pr1_1', 'pr2_12', 'pr13_60', 'prx1_1', 'prx2_12', 'prx13_60', 'me', 'ffdate', 'ffyear', 'ffmonth', 'months_in', 'wt', 'dec_me', 'dltt', 'mib', 'revt', 'csho', 'adjex_f', 'act', 'xint', 'pstk', 'txdi', 'gvkey', 'ib', 'xsga', 'dlc', 'ceq', 'che', 'datadate', 'txdc', 'dpc', 'ibc', 'fyear', 'pstkl', 'teq', 'cogs', 'pstkrv', 'lct', 'dpre', 'txditc', 'seq', 'at', 'sale', 'year_end', 'years_in', 'ps', 'be', 'earn', 'profit', 'op', 'inv', 'cf', 'csho_adj', 'd_owcap_adj', 'ac', 'ni_csho_adj', 'nsi', 'ffbm', 'bm', 'ffep', 'ep', 'ffcfp', 'cfp', 'beta'] # for annual #VALID_VARS = ['gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', # 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 
'shrout', # 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev'] # if no keyword arguments are given then these are the defaults returned DEFAULT_VARS = ['date', 'gvkey', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'datadate', 'year_end', 'ffdate', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'wt', 'dp', 'be', 'bm', 'ffbm', 'ep', 'ffep', 'cfp', 'ffcfp', 'inv', 'op', 'pr2_12', 'pr1_1', 'pr13_60', 'beta', 'ac', 'nsi', 'years_in', 'months_in', 'month', 'ffyear'] VARS_DATA_TYPE = {'permno': str, 'permco': str, 'ticker': str, 'shrcd': str, 'exchcd': str, 'prc': float, 'shrout': float, 'adjret': float, 'adjretx': float, 'adjcumret': float, 'adjcumretx': float, 'dp': float, 'year': int, 'month': int, 'pr1_1': float, 'pr2_12': float, 'pr13_60': float, 'prx1_1': float, 'prx2_12': float, 'prx13_60': float, 'me': float, 'ffyear': int, 'ffmonth': int, 'months_in': int, 'wt': float, 'dec_me': float, 'dltt': float, 'mib': float, 'revt': float, 'csho': float, 'adjex_f': float, 'act': float, 'xint': float, 'pstk': float, 'txdi': float, 'gvkey': str, 'ib': float, 'xsga': float, 'dlc': float, 'ceq': float, 'che': float, 'txdc': float, 'dpc': float, 'ibc': float, 'fyear': int, 'pstkl': float, 'teq': float, 'cogs': float, 'pstkrv': float, 'lct': float, 'dpre': float, 'txditc': float, 'seq': float, 'at': float, 'sale': float, 'years_in': int, 'ps': float, 'be': float, 'earn': float, 'profit': float, 'op': float, 'inv': float, 'cf': float, 'csho_adj': float, 'd_owcap_adj': float, 'ac': float, 'ni_csho_adj': float, 'nsi': float, 'ffbm': float, 'bm': float, 'ffep': float, 'ep': float, 'ffcfp': float, 'cfp': float, 'beta': float} ############################################################################################################################################# # Query Validation and Error Checking ############################################################################################################################################# 
# keywrods 'additional_vars' and 'vars' cannot be used simultaneously if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously') # create list of the variables being quireied query_vars = None if('vars' in kwargs): # variable arguments to query for query_vars = kwargs['vars'] # 'permno' needs to be in the query vars for merging if('permno' not in query_vars): query_vars.insert(0, 'permno') # add date if people forgot if('date' not in query_vars): query_vars.insert(0, 'date') else: if('add_vars' in kwargs): query_vars = DEFAULT_VARS + kwargs['add_vars'] else: query_vars = DEFAULT_VARS if('sub_vars' in kwargs): query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']] if('all_vars' in kwargs): query_vars = VALID_VARS # make sure that all vars are valid to be quieired (if your every actually reading these comments u know that i cant spell queried without googleing it) all_valid = all(elem in VALID_VARS for elem in query_vars) if(not all_valid): incorrect_vars = list(set(query_vars) - set(VALID_VARS)) raise Exception(f'Variables {incorrect_vars} cannot be queried from the combined CRSP/Compustat merged table. 
The CCM table does not contain all of the variables that are in CRSP and Compustat.') specific_query = False id_type = '' ids = [] if('id_type' in kwargs or 'ids' in kwargs): if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.') if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.') specific_query = True id_type = kwargs['id_type'] ids = kwargs['ids'] ############################################################################################################################################## # Load the raw data ############################################################################################################################################## # read in raw dataframe from local sql database raw_df = pd.read_sql(self._CCM_sql_query(start_date, end_date, vars = query_vars, specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine) ############################################################################################################################################## # Clean the raw data ############################################################################################################################################## # I HATE U SEC if(query_vars is None): raw_df.fyear = raw_df.fyear.astype(float) if(not query_vars is None): if('fyear' in query_vars): raw_df.fyear = raw_df.fyear.astype(float) # get vars in the dataframe quried_vars = list(set(list(raw_df.columns)) - set(['date', 'datadate', 'ffdate', 'year_end'])) vars_dtypes = {} for var in quried_vars: vars_dtypes[var] = VARS_DATA_TYPE[var] # convert to correct data types raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d') raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d') raw_df.ffdate = pd.to_datetime(raw_df.ffdate, format = '%Y-%m-%d') raw_df.year_end = 
pd.to_datetime(raw_df.year_end, format = '%Y-%m-%d') raw_df = raw_df.astype(vars_dtypes) # replace and python objects 'None' to np.nan raw_df = raw_df.fillna(value = np.nan) # reset to original variables, drop duplicates, and reset the index raw_df = raw_df[query_vars] raw_df = raw_df.drop_duplicates() raw_df = raw_df.sort_values(by = ['permno', 'date']) raw_df = raw_df.reset_index(drop = True) # return dataframe return(raw_df) def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) -> pd.DataFrame: """ Used to query the raw Compustat tables. Parameters ___________ start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * Q: quarterly * A: annual Keyword Arguments __________________ vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for. Note _____ The defualt variables that are queired for from the quarterly file have their names changed to mirror those in the annual file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make it easier to compute the anomally characterisitcs when creating the combined CCM tables. Note _____ By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks. 
To do this you need to know the actual name of the varibale that you want to query for, paying attention to Compustat's naming conventions between their annual and quarterly files. Note _____ The defualt variables that are queried for are if the frequency given is annual: 'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib' If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'. Note _____ There is less error checking in this function compared to the other methods in this class because of the large number of variables in Compustat. """ STD_VARS = None if(freq == 'A'): STD_VARS = ['gvkey', 'datadate', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdc', 'dpc', 'che', 'dlc', 'ceq', 'seq', 'teq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'ibc', 'dltt', 'mib', 'ib', 'dp'] else: STD_VARS = ['gvkey', 'datadate', 'tic', 'atq', 'saleq', 'cogsq', 'actq', 'txdiq', 'cshoq', 'lctq', 'txdcy', 'dpcy', 'cheq', 'dlcq', 'ceqq', 'seqq', 'teqq', 'pstkq', 'txditcq', 'xintq', 'xsgaq', 'ibcy', 'dlttq', 'mibq', 'ibq', 'dpq'] DEFAULT_DTYPES = {'gvkey': str, 'ticker': str, 'at': float, 'sale': float, 'cogs': float, 'act': float, 'txdi': float, 'csho': float, 'lct': float, 'dltt': float, 'mib': float, 'txdc': float, 'dpre': float, 'che': float, 'dlc': float, 'ceq': float, 'seq': float, 'teq': float, 'pstk': float, 'txditc': float, 'xint': float, 'xsga': float, 'ibc': float, 'ib': float} CREATED_VARS = ['years_in'] ############################################################################################################################################# # Query Validation and Error Checking ############################################################################################################################################# if(freq not in ['Q', 'A']): raise Exception('Invlaid frequency given 
to query_compustat') # keywrods 'additional_vars' and 'vars' cannot be used simultaneously if('vars' in kwargs and 'add_vars' in kwargs): raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously') # create list of the variables being quireied query_vars = None if('vars' in kwargs): # variable arguments to query for query_vars = kwargs['vars'] # 'permno' needs to be in the query vars for merging if('gvkey' not in query_vars): query_vars.insert(0, 'gvkey') # add date if people forgot if('datadate' not in query_vars and 'date' not in query_vars): query_vars.insert(0, 'datadate') else: if('add_vars' in kwargs): query_vars = STD_VARS + kwargs['add_vars'] else: query_vars = STD_VARS if('sub_vars' in kwargs): sub_vars = ['tic' if elem == 'ticker' else elem for elem in kwargs['sub_vars']] query_vars = [elem for elem in query_vars if elem not in sub_vars] query_vars = ['datadate' if elem == 'date' else elem for elem in query_vars] query_vars = ['tic' if elem == 'ticker' else elem for elem in query_vars] query_vars = ['conm' if elem == 'comnam' else elem for elem in query_vars] if('all_vars' in kwargs): query_vars = None indfmts = kwargs['indfmts'] if('indfmts' in kwargs) else ['INDL'] # default: Industrial, Financial datafmts = kwargs['datafmts'] if('datafmts' in kwargs) else ['STD'] # default: Standard popsrcs = kwargs['popsrcs'] if('popsrcs' in kwargs) else ['D'] # default: Consolidated consols = kwargs['consols'] if('consols' in kwargs) else ['C'] # default: Consolidated specific_query = False id_type = '' ids = [] if('id_type' in kwargs or 'ids' in kwargs): if('id_type' not in kwargs or 'ids' not in kwargs): raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.') if(len(kwargs['ids']) == 0): raise Exception('\'ids\' keyowrd argument given an empty list.') specific_query = True id_type = kwargs['id_type'] if(id_type == 'ticker'): id_type = 'tic' ids = kwargs['ids'] 
query_vars_DB = list(set(query_vars) - set(CREATED_VARS)) ############################################################################################################################################## # Load the raw data ############################################################################################################################################## # read in raw dataframe from local sql database raw_df = pd.read_sql(self._compustat_SQL_query(start_date, end_date, freq, vars = query_vars_DB, indfmt = indfmts, datafmt = datafmts, popsrc = popsrcs, consol = consols, specific_query = specific_query, id_type = id_type, ids = ids), con = self.sql_engine) ############################################################################################################################################## # Clean the raw data ############################################################################################################################################## # rename columns raw_df = raw_df.rename(columns = {'tic': 'ticker', 'conm':'comnam'}) # rename the default columns to match the names from the COMPA_FUNDA if(freq == 'Q'): # quarterly compustat # dont balme me for the different names blame compustat raw_df = raw_df.rename(columns = {'atq':'at', 'seqq':'seq', 'ceqq':'ceq', 'teqq':'teq', 'pstkq':'pstk', 'txdcy':'txdc', 'txditcq':'txditc', 'saleq':'sale', 'cogsq':'cogs', 'xintq':'xint', 'xsgaq':'xsga', 'mibq':'mib', 'ibcy':'ibc', 'txdiq':'txdi', 'dpq':'dpre', 'cshoq':'csho', 'adjex':'adjex_f', 'actq':'act', 'lctq':'lct', 'cheq':'che', 'dlcq':'dlc', 'dlttq': 'dltt', 'ibq': 'ib'}) else: # annual compustat # rename columns for consistency raw_df = raw_df.rename(columns = {'dp': 'dpre'}) # make date a datetime.date and align to the end of the year/quarter raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d') if(freq == 'A'): raw_df['year_end'] = raw_df.datadate + YearEnd(0) else: raw_df['quarter_end'] = raw_df.datadate + QuarterEnd(0) # I HATE U 
# NOTE(review): the tail of query_compustat() (dtype coercion, de-duplication, sorting,
# return) occupied the start of this span in the extracted source; that method's
# definition begins above this chunk and is not reproduced here.
def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) -> pd.DataFrame:
    """
    Used to query the raw CRSP files. Additionally, variables can be created and stock
    splits can be adjusted for.

    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date: The ending date of the data query.
    freq: The frequency of the data query. Choices are:
        * M: monthly
        * D: daily
    adj_stocksplit: default = True; Whether or not to adjust for a stock split event.

    Keyword Arguments
    __________________
    vars: list; The variables to query for.
    add_vars: list; Additional variables to query for ontop of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to true to query for all variables in the table.
    id_type: str; Type of ID used to query for specific assets. Choices are:
        * ticker
        * gvkey
        * permno
        * cusip
        * permco
    ids: list; The ids of type 'id_type' to query for.
    exchcds: list; The exchange codes to use for querying
        (default: NYSE, NYSE Market (formally AMEX), and NASDAQ)
    shrcds: list; The share codes to use for querying (default: US Common Stock)

    Note
    _____
    The variables that are created from CRSP primitives are:
        * 'me': Market Equity (millions)
        * 'adjret': Returns adjusted for delisting events
        * 'adjretx': Returns adjusted for delisting events ex. dividend
        * 'dvd': Dividend (uses 'adjret' and 'adjretx' to calculate)
        * 'dp': Dividend-to-Price Ratio
        * 'cumret'/'cumretx': cumulative (ex-dividend) returns
        * 'adjcumret'/'adjcumretx': cumulative delisting-adjusted returns

    Note
    _____
    This function defaults to querying for all companies that are consistent with Fama's
    definitions. That is to say assets with a share code of 10 or 11 and an exchange code
    of 1, 2, or 3.
    """
    # raw table columns differ by observational frequency
    if(freq == 'M'):
        STD_VARS = ['date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx',
                    'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc',
                    'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd',
                    'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt',
                    'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt']
    else:
        STD_VARS = ['date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd',
                    'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol',
                    'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat',
                    'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt',
                    'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd']
    # variables derived from the raw CRSP primitives (not in the SQL table)
    CREATE_VARS = ['me', 'adjret', 'adjretx', 'dvd', 'dp', 'cumret', 'cumretx', 'adjcumret', 'adjcumretx']
    VALID_VARS = STD_VARS + CREATE_VARS
    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx',
                    'adjcumret', 'adjcumretx', 'me', 'dp', 'vol']
    # variable data types
    VARS_DATA_TYPE = {'cusip': str, 'permno': str, 'permco': str, 'comnam': str, 'compno': str, 'ticker': str,
                      'primexch': str, 'tsymbol': str, 'secstat': str, 'hsiccd': str, 'naics': str, 'siccd': str,
                      'trdstat': str, 'ncusip': str, 'shrcd': str, 'exchcd': str, 'issuno': str, 'hexcd': str,
                      'shrcls': str, 'ret': float, 'retx': float, 'shrout': float, 'prc': float, 'cfacshr': float,
                      'cfacpr': float, 'bidlo': float, 'bid': float, 'ask': float, 'askhi': float, 'spread': float,
                      'altprc': float, 'vol': float, 'dlstdt': str, 'dlstcd': str, 'nwperm': str, 'nwcomp': str,
                      'nextdt': str, 'dlamt': float, 'dlretx': float, 'dlprc': float, 'dlpdt': str, 'dlret': float,
                      'acperm': str, 'accomp': str, 'me': float, 'adjret': float, 'adjretx': float, 'dvd': float,
                      'adjdvd': float, 'dp': float, 'openprc': float, 'numtrd': float, 'cumret': float,
                      'cumretx': float, 'adjcumret': float, 'adjcumretx': float}

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################
    if(freq not in ['D', 'M']):
        raise Exception('Invlaid frequency given to query_CRSP')
    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs):
        raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')
    # create list of the variables being queried
    if('vars' in kwargs):
        # BUGFIX: copy so the inserts/appends below do not mutate the caller's list
        query_vars = list(kwargs['vars'])
        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars):
            query_vars.insert(0, 'permno')
        # add date if people forgot
        if('date' not in query_vars):
            query_vars.insert(0, 'date')
    else:
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + kwargs['add_vars']
        else:
            query_vars = DEFAULT_VARS
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
    if('all_vars' in kwargs):
        query_vars = VALID_VARS
    # used for dataframe formatting at the end
    og_vars = query_vars.copy()
    # make sure that all vars are valid to be queried
    if(not all(elem in VALID_VARS for elem in query_vars)):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from CRSP.')
    # always adjust for stock splits (can disable this)
    if(adj_stocksplit):
        for needed in ('prc', 'cfacpr', 'shrout', 'cfacshr'):
            if(needed not in query_vars):
                query_vars.append(needed)
    # created variables need their primitives pulled from the table as well
    if('me' in query_vars):
        if('prc' not in query_vars): query_vars.append('prc')
        if('shrout' not in query_vars): query_vars.append('shrout')
    if('dp' in query_vars):
        if('dvd' not in query_vars): query_vars.append('dvd')
    if('dvd' in query_vars):
        if('adjret' not in query_vars): query_vars.append('adjret')
        if('adjretx' not in query_vars): query_vars.append('adjretx')
        if('prc' not in query_vars): query_vars.append('prc')
    # BUGFIX: the cumulative adjusted returns are built from the 'adjret'/'adjretx'
    # columns, so those must be requested too (processed before the adjret/adjretx
    # dependency checks below so the transitive deps get added)
    if('adjcumret' in query_vars):
        if('adjret' not in query_vars): query_vars.append('adjret')
    if('adjcumretx' in query_vars):
        if('adjretx' not in query_vars): query_vars.append('adjretx')
    if('adjret' in query_vars):
        if('ret' not in query_vars): query_vars.append('ret')
        if('dlret' not in query_vars): query_vars.append('dlret')
    if('adjretx' in query_vars):
        if('retx' not in query_vars): query_vars.append('retx')
        if('dlretx' not in query_vars): query_vars.append('dlretx')
    if('cumret' in query_vars):
        if('ret' not in query_vars): query_vars.append('ret')
    if('cumretx' in query_vars):
        if('retx' not in query_vars): query_vars.append('retx')
    exchcds = kwargs['exchcds'] if('exchcds' in kwargs) else [1, 2, 3]  # default: NYSE, NYSE MKT, NASDAQ
    shrcds = kwargs['shrcds'] if('shrcds' in kwargs) else [10, 11]      # default: US-based common stock
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs):
            raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0):
            raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']
    # created vars are not in the table so remove them
    db_vars = [var for var in query_vars if var not in CREATE_VARS]

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################
    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CRSP_SQL_query(start_date, end_date, freq, vars = db_vars, exchcds = exchcds,
                                              shrcds = shrcds, specific_query = specific_query,
                                              id_type = id_type, ids = ids),
                         con = self.sql_engine)

    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################
    # I HATE U SEC: integer-coded identifiers come back as floats, use nullable Int64
    DOWNCAST_VARS = ['permno', 'permco', 'exchcd', 'issuno', 'hexcd', 'shrcd', 'compno', 'hsiccd', 'naics',
                     'siccd', 'acperm', 'accomp', 'dlstcd', 'nwcomp', 'nwperm']
    for var in DOWNCAST_VARS:
        if(var in query_vars):
            raw_df[var] = raw_df[var].astype('Int64')
    # get vars in the dataframe (date columns are typed separately below)
    quried_vars = list(set(list(raw_df.columns)) - set(['altprcdt', 'date', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']))
    vars_dtypes = {}
    for var in quried_vars:
        vars_dtypes[var] = VARS_DATA_TYPE[var]
    # convert dates to datetimes and align to end of month
    raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
    if(freq == 'M'):
        raw_df.date += MonthEnd(0)
    if('altprcdt' in query_vars): raw_df.altprcdt = pd.to_datetime(raw_df.altprcdt, format = '%Y-%m-%d')
    if('nameendt' in query_vars): raw_df.nameendt = pd.to_datetime(raw_df.nameendt, format = '%Y-%m-%d')
    if('namedt' in query_vars): raw_df.namedt = pd.to_datetime(raw_df.namedt, format = '%Y-%m-%d')
    if('dlstdt' in query_vars): raw_df.dlstdt = pd.to_datetime(raw_df.dlstdt, format = '%Y-%m-%d')
    if('nextdt' in query_vars): raw_df.nextdt = pd.to_datetime(raw_df.nextdt, format = '%Y-%m-%d')
    if('dlpdt' in query_vars): raw_df.dlpdt = pd.to_datetime(raw_df.dlpdt, format = '%Y-%m-%d')
    # make sure that the data is the correct type
    raw_df = raw_df.astype(vars_dtypes)
    # replace any python 'None' objects with np.nan
    raw_df = raw_df.fillna(value = np.nan)
    # BUGFIX: sort before any group-wise cumprod/rolling/shift so the time-series
    # computations below do not depend on the row order returned by the database
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    # adjust for stock splits
    if(adj_stocksplit):
        raw_df.prc /= raw_df.cfacpr
        raw_df.shrout *= raw_df.cfacshr
    # Market Equity. Market equity (size) is price times shares outstanding. Price and shares outstanding from CRSP.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    if('me' in query_vars):
        raw_df['me'] = raw_df.prc.abs() * raw_df.shrout
        # convert market equity to $millions
        raw_df.me /= 1e3
    # adjust for delisting return
    if('adjret' in query_vars):
        raw_df.dlret = raw_df.dlret.fillna(value = 0.0)
        raw_df['adjret'] = ((1 + raw_df.ret) * (1 + raw_df.dlret)) - 1
    if('adjretx' in query_vars):
        raw_df.dlretx = raw_df.dlretx.fillna(value = 0.0)
        raw_df['adjretx'] = ((1 + raw_df.retx) * (1 + raw_df.dlretx)) - 1
    # create dividends paid using 'adjret' and 'adjretx'
    # NOTE(review): the rolling dividend sum below groups by permno while this shift
    # groups by permco — confirm the intended grouping key
    if('adjret' in query_vars and 'adjretx' in query_vars):
        raw_df['dvd'] = (raw_df.adjret - raw_df.adjretx) * raw_df.groupby(['permco'])['prc'].shift(1).abs()
    # create cumulative returns
    if('cumret' in query_vars):
        raw_df['1+ret'] = 1 + raw_df.ret
        raw_df['cumret'] = raw_df.groupby(by = ['permno'])['1+ret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+ret'])
    if('cumretx' in query_vars):
        # BUGFIX: was compounding 'ret' instead of 'retx'
        raw_df['1+retx'] = 1 + raw_df.retx
        raw_df['cumretx'] = raw_df.groupby(by = ['permno'])['1+retx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+retx'])
    if('adjcumret' in query_vars):
        # BUGFIX: was compounding the unadjusted 'ret' instead of 'adjret'
        raw_df['1+adjret'] = 1 + raw_df.adjret
        raw_df['adjcumret'] = raw_df.groupby(by = ['permno'])['1+adjret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjret'])
    if('adjcumretx' in query_vars):
        # BUGFIX: was gated on 'adjcumret' (copy/paste) and compounded 'ret' instead of 'adjretx'
        raw_df['1+adjretx'] = 1 + raw_df.adjretx
        raw_df['adjcumretx'] = raw_df.groupby(by = ['permno'])['1+adjretx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjretx'])
    # Dividend Yield. The dividend yield used to form portfolios in June of year t is the total dividends paid from July of t-1
    # to June of t per dollar of equity in June of t. The dividend yield is computed using the with and without dividend returns
    # from CRSP, as described in Fama and French, 1988, "Dividend yields and expected stock returns," Journal of Financial Economics 25.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    # NOTE: Following Fama French the dividend price ratio uses the last year of dividends paid out if possible with a minimum 7 months.
    if('dp' in query_vars):
        if(freq == 'M'):
            min_periods = 7
            window = 12
        else:
            min_periods = 147  # 252 days / 12 months * 7 months
            window = 252
        raw_df['cumdvd'] = raw_df.groupby(['permno'])['dvd'].rolling(min_periods = min_periods, window = window).sum().reset_index(level = 'permno')[['dvd']]
        raw_df['dp'] = raw_df.cumdvd / raw_df.prc.abs()
        raw_df.dp = np.where((raw_df.dp.isnull()) | (raw_df.dp < 0), np.nan, raw_df.dp)
        raw_df = raw_df.drop(columns = ['cumdvd'])
    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[og_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)
    # return the raw dataframe
    return(raw_df)

def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> pd.DataFrame:
    """
    Query the risk-free rate from the Fama-French library on local WRDS. This rate is
    equivalent to the 1 month T-Bill rate.

    Parameters
    ___________
    start_date: datetime.date\n
        Starting date of the dataset being queried.
    end_date: datetime.date\n
        Ending date of the dataset being queried.
    obs_freq: str\n
        The observational frequency of the CRSP database being queried. Choices are:
            * 'D' : daily
            * 'M' : monthly
            * 'A' : annually

    Returns
    ________
    raw_df: pd.DataFrame\n
        Risk-free rate data with columns 'date' and 'rf'.

    Note
    _____
    The underlying SQL makes adjustments for NYSE holidays during compounding.
    """
    # Monthly observations are dated on the 1st of each month, so for any 'start_date' that doesn't
    # coincide w/ the 1st of a month, roll it back so the query pulls the monthly observation of interest.
    if(obs_freq in ['M', 'A'] and start_date != (start_date + MonthBegin(-1)).date()):
        start_date = (start_date + MonthBegin(-1)).date()
    # load in dataframe from the local database
    raw_df = pd.read_sql(self._rf1m_SQL_query(start_date, end_date, obs_freq), con = self.sql_engine)
    # convert dates to datetimes
    raw_df['date'] = pd.to_datetime(raw_df['date'])
    # convert trading dates to end-of-period when the frequency is not daily
    if(obs_freq == 'M'):
        raw_df['date'] = raw_df['date'] + MonthEnd(0)
    elif(obs_freq == 'A'):
        raw_df['date'] = raw_df['date'] + YearEnd(0)
    # return the raw dataframe
    return(raw_df)

def query_link_table(self) -> pd.DataFrame:
    """
    Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities
    to Compustat companies on permno and gvkey.

    Returns
    ________
    raw_df: pd.DataFrame\n
        The raw dataframe pulled from local WRDS database.

    Note
    _____
    Currently this function only works if a local copy of the WRDS database exists w/ the
    CCM Linktable.
    """
    sql_str = """
                SELECT gvkey, lpermno as permno, lpermco as permco, linktype, linkprim, linkdt, linkenddt
                FROM CRSP_CCMXPF_LINKTABLE
                WHERE substr(linktype, 1, 1) = 'L'
                AND (linkprim = 'C' or linkprim = 'P')
              """
    # read in raw dataframe from local database
    raw_df = pd.read_sql(sql_str, con = self.sql_engine)
    # strip any float formatting from permno/permco before stringifying
    raw_df.permco = pd.to_numeric(raw_df.permco, downcast = 'integer')
    raw_df.permno = pd.to_numeric(raw_df.permno, downcast = 'integer')
    # convert identifiers to strings
    raw_df.gvkey = raw_df.gvkey.astype(str)
    raw_df.permno = raw_df.permno.astype(str)
    raw_df.permco = raw_df.permco.astype(str)
    # if linkenddt is missing then the link is still active: set it to today's date
    raw_df.linkenddt = raw_df.linkenddt.fillna(pd.to_datetime('today').date())
    # convert to datetimes
    raw_df.linkdt = raw_df.linkdt.astype('datetime64[ns]')
    raw_df.linkenddt = raw_df.linkenddt.astype('datetime64[ns]')
    # return the raw dataframe
    return(raw_df)

# ----------------------------------------------------------------------------------------------------------------------------
# INTERNAL METHODS (class <QueryWRDS>)
#
# These are internal methods and should only be called within this class. Functionality and accuracy of these methods cannot
# be guaranteed if they are called outside of this class.
# ----------------------------------------------------------------------------------------------------------------------------
# INTERNAL METHOD
def _list_to_sql_str(self, lst: list, table: str = None) -> str:
    """
    INTERNAL METHOD: Render a python list as a comma-separated SQL fragment.

    If 'table' is None the elements are rendered as quoted SQL literals
    ('a', 'b', ...); otherwise as table-qualified column names (table.a, table.b, ...).
    An empty list yields an empty string.
    """
    if(table is None):
        parts = [f'\'{var}\'' for var in lst]
    else:
        parts = [f'{table}.{var}' for var in lst]
    return(', '.join(parts))

# INTERNAL METHOD
def _CCM_sql_query(self, start_date: datetime.date, end_date: datetime.date, vars: list, specific_query: bool, id_type: str, ids: list) -> str:
    """
    INTERNAL METHOD: Create the SQL string used to query the local merged
    CRSP/Compustat (CCM) table between 'start_date' and 'end_date', optionally
    restricted to specific ids.
    """
    table = 'CCM'
    # convert date time objects to strings for the SQL query
    start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
    end_date_str = '\'' + end_date.strftime('%Y-%m-%d') + '\''
    # create argument string
    sql_str = f'SELECT {self._list_to_sql_str(vars, table)} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'
    # additional subsetting
    if(specific_query):
        sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'
    return(sql_str)

# INTERNAL METHOD
def _compustat_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, indfmt: list, datafmt: list, popsrc: list, consol: list, specific_query: bool, id_type: str, ids: list) -> str:
    """
    INTERNAL METHOD: Create SQL string used to query the WRDS Compustat database.

    Parameters
    ___________
    start_date: Starting date for the dataset queried.
    end_date: Ending date for the dataset queried.
    freq: The observational frequency of the query. Choices are:
        * 'Q' : quarterly
        * 'A' : annual
    vars: Columns to select; None selects all columns ('*').
    indfmt/datafmt/popsrc/consol: Compustat filter code lists (empty list = no filter).

    Returns
    ________
    sql_str: String containing the SQL code used to query the specified Compustat
        database based on the start and end date and frequency given.
    """
    table = f'COMPA_FUND{freq}'
    # convert date time objects to strings for the SQL query
    start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
    end_date_str = '\'' + end_date.strftime('%Y-%m-%d') + '\''
    # create argument string; None means "all columns"
    if(vars is None):
        var_str = '*'
    else:
        var_str = self._list_to_sql_str(vars, table)
    sql_str = f'SELECT {var_str} FROM {table} WHERE datadate BETWEEN {start_date_str} AND {end_date_str}'
    # additional subsetting
    if(len(indfmt) != 0):
        sql_str += f' AND {table}.indfmt IN ({self._list_to_sql_str(indfmt)})'
    if(len(datafmt) != 0):
        sql_str += f' AND {table}.datafmt IN ({self._list_to_sql_str(datafmt)})'
    if(len(popsrc) != 0):
        sql_str += f' AND {table}.popsrc IN ({self._list_to_sql_str(popsrc)})'
    if(len(consol) != 0):
        sql_str += f' AND {table}.consol IN ({self._list_to_sql_str(consol)})'
    if(specific_query):
        sql_str += f' AND {table}.{id_type} IN ({self._list_to_sql_str(ids)})'
    return(sql_str)

# INTERNAL METHOD
def _CRSP_SQL_query(self, start_date: datetime.date, end_date: datetime.date, freq: str, vars: list, exchcds: list, shrcds: list, specific_query: bool, id_type: str, ids: list) -> str:
    """
    INTERNAL METHOD: Create SQL string used to query the local WRDS CRSP database.

    Parameters
    ___________
    start_date: Starting date for the dataset queried.
    end_date: Ending date for the dataset queried.
    freq: Observational frequency. Choices are:
        * 'D' : daily
        * 'M' : monthly
    vars: Columns to select from the CRSP table.
    exchcds/shrcds: Exchange/share code lists to filter on (empty list = no filter).

    Returns
    ________
    sql_str : str\n
        String containing the SQL code used to query the specified CRSP database based
        on the observational frequency given.
    """
    table = f'CRSP_{freq}'
    # convert date time objects to strings for the SQL query
    start_date_str = '\'' + start_date.strftime('%Y-%m-%d') + '\''
    end_date_str = '\'' + end_date.strftime('%Y-%m-%d') + '\''
    # create argument string
    var_str = self._list_to_sql_str(vars, table)
    sql_str = f'SELECT {var_str} FROM {table} WHERE date BETWEEN {start_date_str} AND {end_date_str}'
    # additional subsetting
    if(len(exchcds) != 0):
        sql_str += f' AND exchcd in ({self._list_to_sql_str(exchcds)})'
    if(len(shrcds) != 0):
        sql_str += f' AND shrcd in ({self._list_to_sql_str(shrcds)})'
    if(specific_query):
        sql_str += f' AND {id_type} IN ({self._list_to_sql_str(ids)})'
    return(sql_str)

# INTERNAL METHOD
def _rf1m_SQL_query(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> str:
    """
    INTERNAL METHOD: Create SQL string used to query the Fama-French risk free rate
    (the 1 month T-Bill rate) from the FF library tables.

    Parameters
    ___________
    start_date: Starting date for the data being queried.
    end_date: Ending date for the data being queried.
    obs_freq: str\n
        The observational frequency being queried. Choices are:
            * 'D' : daily
            * 'M' : monthly
            * 'A' : annual

    Returns
    ________
    sql_str : str\n
        String containing the SQL code used to query the risk free rate.

    Note
    _____
    Depending on the observational frequency (obs_freq) given, the compounding of the
    risk-free rate changes.
    """
    # convert date time objects to strings for the SQL query
    start_date_str = start_date.strftime('%Y-%m-%d')
    end_date_str = end_date.strftime('%Y-%m-%d')
    # depending on the frequency supplied the compounding changes
    if(obs_freq == 'D'):
        sql_1 = 'strftime(\'%d\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%d\', date) AS diff'
        sql_2 = 'rf AS cumrf'
        library = 'FF_FACTORS_DAILY'
    elif(obs_freq == 'M'):
        sql_1 = 'strftime(\'%m\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%m\', date) AS diff'
        sql_2 = 'rf AS cumrf'
        library = 'FF_FACTORS_MONTHLY'
    elif(obs_freq == 'A'):
        sql_1 = 'strftime(\'%Y\', LEAD(date) OVER (ORDER BY date)) - strftime(\'%Y\', date) AS diff'
        sql_2 = 'EXP(SUM(LN(1 + rf)) OVER (PARTITION BY strftime(\'%Y\', date))) - 1 AS cumrf'
        library = 'FF_FACTORS_MONTHLY'
    else:
        cprint.fatal('No valid observational frequency given.', interrupt = True)
        # BUGFIX: if cprint.fatal does not terminate the process, the original code fell
        # through and hit a NameError on the unbound sql_1/sql_2/library; fail explicitly
        raise Exception('No valid observational frequency given.')
    sql_dic = {'sql_1': sql_1, 'sql_2': sql_2, 'library': library,
               'start_date': '\'' + start_date_str + '\'',
               'end_date': '\'' + end_date_str + '\''}
    sql_str = """
              SELECT date, rf
              FROM
                  (
                  SELECT date, {0}, rf, {1}
                  FROM {2}
                  WHERE date
                  BETWEEN {3} AND {4}
                  ) AS crsp_rf
              WHERE diff != 0 OR diff IS NULL
              """.format(sql_dic['sql_1'], sql_dic['sql_2'], sql_dic['library'], sql_dic['start_date'], sql_dic['end_date'])
    return(sql_str)
Methods
def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pandas.core.frame.DataFrame
-
Used to query the merged CRSP/Compustat (CCM) table.
Parameters
start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * Q: quarterly * A: annual
Keyword Arguments
vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for.
Note
The variables that can be queried for are: 'gvkey', 'date', 'fyear', 'at', 'revt', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dpre', 'che', 'dlc', 'ceq', 'seq', 'pstk', 'txditc', 'xint', 'xsga', 'mib', 'ib', 'cf', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'shrcd', 'exchcd', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe', 'cfp', 'inv', 'profit', 'op', 'lev', 'pr2_12', 'pr1_1', 'pr13_60', 'prx2_12', 'prx1_1', 'prx13_60'
Note
If the frequency is quarterly, the variables that can be queried for are the same as the annual file except for 'pstkrv' and 'pstkl'.
Note
The default variables that are queried for are: 'date', 'gvkey', 'permno', 'permco', 'ticker', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'dp', 'ps', 'be', 'bm', 'pe',
'cfp', 'inv', 'op', 'PR2_12', 'PR1_1', 'PR13_60'

Expand source code
def query_CCM(self, start_date: datetime.date, end_date: datetime.date, **kwargs) -> pd.DataFrame:
    """
    Used to query the merged CRSP/Compustat (CCM) table.

    Parameters
    ___________
    start_date: The starting date of the data query.
    end_date: The ending date of the data query.

    Keyword Arguments
    __________________
    vars: list; The variables to query for.
    add_vars: list; Additional variables to query for ontop of the default variables.
    sub_vars: list; Variables to remove from the default variables.
    all_vars: bool; Set to true to query for all variables in the table.
    id_type: str; Type of ID used to query for specific assets. Choices are:
        * ticker
        * gvkey
        * permno
        * cusip
        * permco
    ids: list; The ids of type 'id_type' to query for.

    Note
    _____
    Only the variables listed in VALID_VARS below can be queried; the CCM table does not
    contain every variable that is in CRSP and Compustat.
    """
    # vars that can be queried for
    VALID_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx',
                  'adjcumret', 'adjcumretx', 'dp', 'year', 'month', 'pr1_1', 'pr2_12', 'pr13_60', 'prx1_1',
                  'prx2_12', 'prx13_60', 'me', 'ffdate', 'ffyear', 'ffmonth', 'months_in', 'wt', 'dec_me', 'dltt',
                  'mib', 'revt', 'csho', 'adjex_f', 'act', 'xint', 'pstk', 'txdi', 'gvkey', 'ib', 'xsga', 'dlc',
                  'ceq', 'che', 'datadate', 'txdc', 'dpc', 'ibc', 'fyear', 'pstkl', 'teq', 'cogs', 'pstkrv', 'lct',
                  'dpre', 'txditc', 'seq', 'at', 'sale', 'year_end', 'years_in', 'ps', 'be', 'earn', 'profit',
                  'op', 'inv', 'cf', 'csho_adj', 'd_owcap_adj', 'ac', 'ni_csho_adj', 'nsi', 'ffbm', 'bm', 'ffep',
                  'ep', 'ffcfp', 'cfp', 'beta']
    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'gvkey', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'datadate', 'year_end',
                    'ffdate', 'prc', 'shrout', 'adjret', 'adjretx', 'me', 'wt', 'dp', 'be', 'bm', 'ffbm', 'ep',
                    'ffep', 'cfp', 'ffcfp', 'inv', 'op', 'pr2_12', 'pr1_1', 'pr13_60', 'beta', 'ac', 'nsi',
                    'years_in', 'months_in', 'month', 'ffyear']
    # variable data types
    # NOTE(review): int-typed columns (e.g. 'fyear', 'year') will fail astype(int) if NaNs
    # survive the query — pre-existing behavior, confirm against the local table contents
    VARS_DATA_TYPE = {'permno': str, 'permco': str, 'ticker': str, 'shrcd': str, 'exchcd': str, 'prc': float,
                      'shrout': float, 'adjret': float, 'adjretx': float, 'adjcumret': float, 'adjcumretx': float,
                      'dp': float, 'year': int, 'month': int, 'pr1_1': float, 'pr2_12': float, 'pr13_60': float,
                      'prx1_1': float, 'prx2_12': float, 'prx13_60': float, 'me': float, 'ffyear': int,
                      'ffmonth': int, 'months_in': int, 'wt': float, 'dec_me': float, 'dltt': float, 'mib': float,
                      'revt': float, 'csho': float, 'adjex_f': float, 'act': float, 'xint': float, 'pstk': float,
                      'txdi': float, 'gvkey': str, 'ib': float, 'xsga': float, 'dlc': float, 'ceq': float,
                      'che': float, 'txdc': float, 'dpc': float, 'ibc': float, 'fyear': int, 'pstkl': float,
                      'teq': float, 'cogs': float, 'pstkrv': float, 'lct': float, 'dpre': float, 'txditc': float,
                      'seq': float, 'at': float, 'sale': float, 'years_in': int, 'ps': float, 'be': float,
                      'earn': float, 'profit': float, 'op': float, 'inv': float, 'cf': float, 'csho_adj': float,
                      'd_owcap_adj': float, 'ac': float, 'ni_csho_adj': float, 'nsi': float, 'ffbm': float,
                      'bm': float, 'ffep': float, 'ep': float, 'ffcfp': float, 'cfp': float, 'beta': float}

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################
    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs):
        raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')
    # create list of the variables being queried
    if('vars' in kwargs):
        # BUGFIX: copy so the inserts below do not mutate the caller's list
        query_vars = list(kwargs['vars'])
        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars):
            query_vars.insert(0, 'permno')
        # add date if people forgot
        if('date' not in query_vars):
            query_vars.insert(0, 'date')
    else:
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + kwargs['add_vars']
        else:
            query_vars = DEFAULT_VARS
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
    if('all_vars' in kwargs):
        query_vars = VALID_VARS
    # make sure that all vars are valid to be queried
    if(not all(elem in VALID_VARS for elem in query_vars)):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from the combined CRSP/Compustat merged table. The CCM table does not contain all of the variables that are in CRSP and Compustat.')
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs):
            raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0):
            raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################
    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CCM_sql_query(start_date, end_date, vars = query_vars, specific_query = specific_query,
                                             id_type = id_type, ids = ids),
                         con = self.sql_engine)

    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################
    # I HATE U SEC
    # BUGFIX: query_vars is always a list at this point, so the original
    # 'if(query_vars is None)' branch was dead code; only the membership check remains
    if('fyear' in query_vars):
        raw_df.fyear = raw_df.fyear.astype(float)
    # get vars in the dataframe (date columns are typed separately below)
    quried_vars = list(set(list(raw_df.columns)) - set(['date', 'datadate', 'ffdate', 'year_end']))
    vars_dtypes = {}
    for var in quried_vars:
        vars_dtypes[var] = VARS_DATA_TYPE[var]
    # convert to correct data types; 'date' is always present (forced in above)
    raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
    # BUGFIX: only convert the optional date columns if they were actually queried; the
    # original converted unconditionally and crashed for custom 'vars' lists without them
    if('datadate' in raw_df.columns):
        raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
    if('ffdate' in raw_df.columns):
        raw_df.ffdate = pd.to_datetime(raw_df.ffdate, format = '%Y-%m-%d')
    if('year_end' in raw_df.columns):
        raw_df.year_end = pd.to_datetime(raw_df.year_end, format = '%Y-%m-%d')
    raw_df = raw_df.astype(vars_dtypes)
    # replace any python 'None' objects with np.nan
    raw_df = raw_df.fillna(value = np.nan)
    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[query_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)
    # return dataframe
    return(raw_df)
def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) ‑> pandas.core.frame.DataFrame
-
Used to query the raw CRSP files. Additionally, variables can be created and stock splits can be adjusted for.
Parameters
start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * M: monthly * D: daily adj_stocksplit: default = True; Whether or not to adjust for a stock split event.
Keyword Arguments
vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for. exchcds: list; The exchange codes to use for querying (default: NYSE, NYSE Market (formally AMEX), and NASDAQ) shrcds: list; The share codes to use for querying (default: US Common Stock)
Note
If the frequency is monthly then the variables that can be quiried for are: 'date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt'
Note
If the frequency is daily then the variables that can be quiried for are: 'date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt', 'openprc', 'numtrd'
Note
The variables that are created from CRSP primitives are: * 'me': Market Equity (millions) * 'adjret': Returns adjusted for delisting events * 'adjretx': Returns adjusted for delisting events ex. dividend * 'dvd': Dividend (uses 'adjret' and 'adjretx' to calculate) * 'dp': Dividend-to-Price Ratio
Note
This function defaults to querying for all companies that are consistent with Fama's definitions. That is to say, assets with a share code of 10 or 11 and an exchange code of 1, 2, or 3.
Expand source code
def query_CRSP(self, start_date: datetime.date, end_date: datetime.date, freq: str, adj_stocksplit: bool = True, **kwargs) -> pd.DataFrame:
    """
    Used to query the raw CRSP files. Additionally, variables can be created and stock
    splits can be adjusted for.

    Parameters
    ___________
    start_date: The starting date of the data query.\n
    end_date: The ending date of the data query.\n
    freq: The frequency of the data query. Choices are:
        * M: monthly
        * D: daily
    adj_stocksplit: default = True; Whether or not to adjust for a stock split event.\n

    Keyword Arguments
    __________________
    vars: list; The variables to query for.\n
    add_vars: list; Additional variables to query for ontop of the default variables.\n
    sub_vars: list; Variables to remove from the default variables.\n
    all_vars: bool; Set to true to query for all variables in the table.\n
    id_type: str; Type of ID used to query for specific assets. Choices are:
        * ticker
        * gvkey
        * permno
        * cusip
        * permco
    ids: list; The ids of type 'id_type' to query for.\n
    exchcds: list; The exchange codes to use for querying (default: NYSE, NYSE Market (formerly AMEX), and NASDAQ)\n
    shrcds: list; The share codes to use for querying (default: US Common Stock)\n

    Note
    _____
    The queryable primitives differ slightly between the monthly and daily files; see
    STD_VARS below for the exact lists.

    Note
    _____
    The variables that are created from CRSP primitives are:
        * 'me': Market Equity (millions)
        * 'adjret': Returns adjusted for delisting events
        * 'adjretx': Returns adjusted for delisting events ex. dividend
        * 'dvd': Dividend (uses 'adjret' and 'adjretx' to calculate)
        * 'dp': Dividend-to-Price Ratio
        * 'cumret'/'cumretx': Cumulative returns (with/ex. dividend)
        * 'adjcumret'/'adjcumretx': Cumulative delisting-adjusted returns (with/ex. dividend)

    Note
    _____
    This function defaults to querying for all companies that are consistent with Fama's
    definitions. That is to say, assets with a share code of 10 or 11 and an exchange code
    of 1, 2, or 3.
    """
    # variables that can be queried for directly from the CRSP tables; the monthly and
    # daily files track slightly different columns
    if(freq == 'M'):
        STD_VARS = ['date', 'permno', 'shrout', 'altprcdt', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd',
                    'shrcls', 'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'spread', 'altprc', 'permco', 'ret',
                    'tsymbol', 'secstat', 'prc', 'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat',
                    'ticker', 'ncusip', 'dlpdt', 'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret',
                    'nwcomp', 'nwperm', 'dlamt']
    else:
        STD_VARS = ['date', 'permno', 'shrout', 'bidlo', 'bid', 'exchcd', 'ask', 'issuno', 'comnam', 'retx', 'hexcd', 'shrcls',
                    'shrcd', 'namedt', 'compno', 'nameendt', 'askhi', 'primexch', 'permco', 'ret', 'tsymbol', 'secstat', 'prc',
                    'hsiccd', 'naics', 'cfacshr', 'cusip', 'vol', 'siccd', 'cfacpr', 'trdstat', 'ticker', 'ncusip', 'dlpdt',
                    'acperm', 'dlretx', 'accomp', 'dlprc', 'nextdt', 'dlstcd', 'dlstdt', 'dlret', 'nwcomp', 'nwperm', 'dlamt',
                    'openprc', 'numtrd']

    # variables created here from CRSP primitives (not stored in the table)
    CREATE_VARS = ['me', 'adjret', 'adjretx', 'dvd', 'dp', 'cumret', 'cumretx', 'adjcumret', 'adjcumretx']
    VALID_VARS = STD_VARS + CREATE_VARS

    # if no keyword arguments are given then these are the defaults returned
    DEFAULT_VARS = ['date', 'permno', 'permco', 'ticker', 'shrcd', 'exchcd', 'prc', 'shrout', 'adjret', 'adjretx', 'adjcumret',
                    'adjcumretx', 'me', 'dp', 'vol']

    # variable data types
    VARS_DATA_TYPE = {'cusip': str, 'permno': str, 'permco': str, 'comnam': str, 'compno': str, 'ticker': str, 'primexch': str,
                      'tsymbol': str, 'secstat': str, 'hsiccd': str, 'naics': str, 'siccd': str, 'trdstat': str, 'ncusip': str,
                      'shrcd': str, 'exchcd': str, 'issuno': str, 'hexcd': str, 'shrcls': str, 'ret': float, 'retx': float,
                      'shrout': float, 'prc': float, 'cfacshr': float, 'cfacpr': float, 'bidlo': float, 'bid': float,
                      'ask': float, 'askhi': float, 'spread': float, 'altprc': float, 'vol': float, 'dlstdt': str,
                      'dlstcd': str, 'nwperm': str, 'nwcomp': str, 'nextdt': str, 'dlamt': float, 'dlretx': float,
                      'dlprc': float, 'dlpdt': str, 'dlret': float, 'acperm': str, 'accomp': str, 'me': float,
                      'adjret': float, 'adjretx': float, 'dvd': float, 'adjdvd': float, 'dp': float, 'openprc': float,
                      'numtrd': float, 'cumret': float, 'cumretx': float, 'adjcumret': float, 'adjcumretx': float}

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################

    if(freq not in ['D', 'M']):
        raise Exception('Invlaid frequency given to query_CRSP')

    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs):
        raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

    # create list of the variables being queried
    query_vars = None
    if('vars' in kwargs):
        query_vars = kwargs['vars']
        # 'permno' needs to be in the query vars for merging
        if('permno' not in query_vars):
            query_vars.insert(0, 'permno')
        # add date if people forgot
        if('date' not in query_vars):
            query_vars.insert(0, 'date')
    else:
        if('add_vars' in kwargs):
            query_vars = DEFAULT_VARS + kwargs['add_vars']
        else:
            query_vars = DEFAULT_VARS
        if('sub_vars' in kwargs):
            query_vars = [elem for elem in query_vars if elem not in kwargs['sub_vars']]
    if('all_vars' in kwargs):
        query_vars = VALID_VARS

    # used for dataframe formatting at the end
    og_vars = query_vars.copy()

    # make sure that all vars are valid to be queried
    all_valid = all(elem in VALID_VARS for elem in query_vars)
    if(not all_valid):
        incorrect_vars = list(set(query_vars) - set(VALID_VARS))
        raise Exception(f'Variables {incorrect_vars} cannot be queried from CRSP.')

    # the price/share adjustment factors are needed to undo stock splits
    if(adj_stocksplit):
        for needed in ['prc', 'cfacpr', 'shrout', 'cfacshr']:
            if(needed not in query_vars):
                query_vars.append(needed)

    # if created variables are being queried for then add the primitives needed to create them
    # NOTE: order matters here: 'dp' needs 'dvd', which in turn needs 'adjret'/'adjretx'
    if('me' in query_vars):
        for needed in ['prc', 'shrout']:
            if(needed not in query_vars):
                query_vars.append(needed)
    if('dp' in query_vars):
        if('dvd' not in query_vars):
            query_vars.append('dvd')
    if('dvd' in query_vars):
        for needed in ['adjret', 'adjretx', 'prc']:
            if(needed not in query_vars):
                query_vars.append(needed)
    if('adjret' in query_vars or 'adjcumret' in query_vars or 'cumret' in query_vars):
        if('ret' not in query_vars):
            query_vars.append('ret')
    if('adjret' in query_vars or 'adjcumret' in query_vars):
        if('dlret' not in query_vars):
            query_vars.append('dlret')
    if('adjretx' in query_vars or 'adjcumretx' in query_vars or 'cumretx' in query_vars):
        if('retx' not in query_vars):
            query_vars.append('retx')
    if('adjretx' in query_vars or 'adjcumretx' in query_vars):
        if('dlretx' not in query_vars):
            query_vars.append('dlretx')

    exchcds = kwargs['exchcds'] if('exchcds' in kwargs) else [1, 2, 3]  # default: NYSE, NYSE MKT, NASDAQ
    shrcds = kwargs['shrcds'] if('shrcds' in kwargs) else [10, 11]      # default: US-based common stock

    # both 'id_type' and 'ids' must be given together to query for specific assets
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs):
            raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0):
            raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        ids = kwargs['ids']

    # created vars are not in the table so remove them
    db_vars = [var for var in query_vars if var not in CREATE_VARS]

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################

    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._CRSP_SQL_query(start_date, end_date, freq, vars = db_vars, exchcds = exchcds, shrcds = shrcds,
                                              specific_query = specific_query, id_type = id_type, ids = ids),
                         con = self.sql_engine)

    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################

    # identifier-like columns come back from SQL as floats; downcast to nullable ints first
    DOWNCAST_VARS = ['permno', 'permco', 'exchcd', 'issuno', 'hexcd', 'shrcd', 'compno', 'hsiccd', 'naics', 'siccd', 'acperm',
                     'accomp', 'dlstcd', 'nwcomp', 'nwperm']
    for var in DOWNCAST_VARS:
        if(var in query_vars):
            raw_df[var] = raw_df[var].astype('Int64')

    # get vars in the dataframe, excluding date columns which are converted separately
    quried_vars = list(set(list(raw_df.columns)) - set(['altprcdt', 'date', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']))
    vars_dtypes = {}
    for var in quried_vars:
        vars_dtypes[var] = VARS_DATA_TYPE[var]

    # convert dates to datetimes and align to end of month for the monthly file
    raw_df.date = pd.to_datetime(raw_df.date, format = '%Y-%m-%d')
    if(freq == 'M'):
        raw_df.date += MonthEnd(0)
    for date_var in ['altprcdt', 'nameendt', 'namedt', 'dlstdt', 'nextdt', 'dlpdt']:
        if(date_var in query_vars):
            raw_df[date_var] = pd.to_datetime(raw_df[date_var], format = '%Y-%m-%d')

    # make sure that the data is the correct type
    raw_df = raw_df.astype(vars_dtypes)

    # replace any python 'None' objects with np.nan
    raw_df = raw_df.fillna(value = np.nan)

    # adjust for stock splits
    if(adj_stocksplit):
        raw_df.prc /= raw_df.cfacpr
        raw_df.shrout *= raw_df.cfacshr

    # Market Equity. Market equity (size) is price times shares outstanding. Price and shares outstanding from CRSP.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    if('me' in query_vars):
        raw_df['me'] = raw_df.prc.abs() * raw_df.shrout
        # convert market equity to $millions
        raw_df.me /= 1e3

    # adjust returns for delisting events
    if('adjret' in query_vars):
        raw_df.dlret = raw_df.dlret.fillna(value = 0.0)
        raw_df['adjret'] = ((1 + raw_df.ret) * (1 + raw_df.dlret)) - 1
    if('adjretx' in query_vars):
        raw_df.dlretx = raw_df.dlretx.fillna(value = 0.0)
        raw_df['adjretx'] = ((1 + raw_df.retx) * (1 + raw_df.dlretx)) - 1

    # create dividends paid using 'adjret' and 'adjretx'
    if('adjret' in query_vars and 'adjretx' in query_vars):
        # NOTE(review): this groups by 'permco' while the cumulative returns group by 'permno' — confirm intended
        raw_df['dvd'] = (raw_df.adjret - raw_df.adjretx) * raw_df.groupby(['permco'])['prc'].shift(1).abs()

    # create cumulative returns
    # BUG FIX: the original built all four series from 'ret'; each now compounds its own return series,
    # and the delisting-adjusted series fold in the delisting return
    if('cumret' in query_vars):
        raw_df['1+ret'] = 1 + raw_df.ret
        raw_df['cumret'] = raw_df.groupby(by = ['permno'])['1+ret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+ret'])
    if('cumretx' in query_vars):
        raw_df['1+retx'] = 1 + raw_df.retx
        raw_df['cumretx'] = raw_df.groupby(by = ['permno'])['1+retx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+retx'])
    if('adjcumret' in query_vars):
        raw_df['1+adjret'] = (1 + raw_df.ret) * (1 + raw_df.dlret.fillna(value = 0.0))
        raw_df['adjcumret'] = raw_df.groupby(by = ['permno'])['1+adjret'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjret'])
    if('adjcumretx' in query_vars):
        raw_df['1+adjretx'] = (1 + raw_df.retx) * (1 + raw_df.dlretx.fillna(value = 0.0))
        raw_df['adjcumretx'] = raw_df.groupby(by = ['permno'])['1+adjretx'].cumprod()
        raw_df = raw_df.drop(columns = ['1+adjretx'])

    # Dividend Yield. The dividend yield used to form portfolios in June of year t is the total dividends paid from July of t-1
    # to June of t per dollar of equity in June of t. The dividend yield is computed using the with and without dividend returns
    # from CRSP, as described in Fama and French, 1988, "Dividend yields and expected stock returns," Journal of Financial Economics 25.
    # SOURCE: http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/Data_Library/variable_definitions.html
    # NOTE: Following Fama French the dividend price ratio uses the last year of dividends paid out if possible with a minimum 7 months.
    if('dp' in query_vars):
        if(freq == 'M'):
            min_periods = 7
            window = 12
        else:
            min_periods = 147  # 252 days / 12 months * 7 months
            window = 252
        raw_df['cumdvd'] = raw_df.groupby(['permno'])['dvd'].rolling(min_periods = min_periods, window = window).sum().reset_index(level = 'permno')[['dvd']]
        raw_df['dp'] = raw_df.cumdvd / raw_df.prc.abs()
        raw_df.dp = np.where((raw_df.dp.isnull()) | (raw_df.dp < 0), np.nan, raw_df.dp)
        raw_df = raw_df.drop(columns = ['cumdvd'])

    # reset to original variables, drop duplicates, and reset the index
    raw_df = raw_df[og_vars]
    raw_df = raw_df.drop_duplicates()
    raw_df = raw_df.sort_values(by = ['permno', 'date'])
    raw_df = raw_df.reset_index(drop = True)

    # return the raw dataframe
    return(raw_df)
def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) ‑> pandas.core.frame.DataFrame
-
Used to query the raw Compustat tables.
Parameters
start_date: The starting date of the data query. end_date: The ending date of the data query. freq: The frequency of the data query. Choices are: * Q: quarterly * A: annual
Keyword Arguments
vars: list; The variables to query for. add_vars: list; Additional variables to query for ontop of the default variables. sub_vars: list; Variables to remove from the default variables. all_vars: bool; Set to true to query for all variables in the table. id_type: str; Type of ID used to query for specific assets. Choices are: * ticker * gvkey * permno * cusip * permco ids: list; The ids of type 'id_type' to query for.
Note
The defualt variables that are queired for from the quarterly file have their names changed to mirror those in the annual file. In most cases this means removing a 'q' at the end of the variable name. For example, in the annual file the fiscal year variable is 'fyear' while in the quarterly file the name is 'fyearq'. This name change is done to the dataframe that will be returned in RAM and not to the underlying Compustat table on DISK. The change is done to make it easier to compute the anomally characterisitcs when creating the combined CCM tables.
Note
By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately 1000 variables that Compustat tracks. To do this you need to know the actual name of the variable that you want to query for, paying attention to Compustat's naming conventions between their annual and quarterly files.
Note
The defualt variables that are queried for are if the frequency given is annual: 'gvkey', 'date', 'fyear', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdb', 'dp', 'che', 'dlc', 'ceq', 'seq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'mib', 'ib'
If the frequency is quarterly it is the same variables excluding 'pstkrv' and 'pstkl'.
Note
There is less error checking in this function compared to the other methods in this class because of the large number of variables in Compustat.
Expand source code
def query_Compustat(self, start_date: datetime.date, end_date: datetime.date, freq: str, **kwargs) -> pd.DataFrame:
    """
    Used to query the raw Compustat tables.

    Parameters
    ___________
    start_date: The starting date of the data query.\n
    end_date: The ending date of the data query.\n
    freq: The frequency of the data query. Choices are:
        * Q: quarterly
        * A: annual

    Keyword Arguments
    __________________
    vars: list; The variables to query for.\n
    add_vars: list; Additional variables to query for ontop of the default variables.\n
    sub_vars: list; Variables to remove from the default variables.\n
    all_vars: bool; Set to true to query for all variables in the table.\n
    id_type: str; Type of ID used to query for specific assets. Choices are:
        * ticker
        * gvkey
        * permno
        * cusip
        * permco
    ids: list; The ids of type 'id_type' to query for.\n

    Note
    _____
    The default variables queried from the quarterly file have their names changed to mirror
    those in the annual file (e.g. 'fyearq' -> 'fyear'). The rename is applied to the returned
    dataframe in RAM, not to the underlying Compustat table on disk, and makes computing anomaly
    characteristics easier when creating the combined CCM tables.

    Note
    _____
    By use of the 'add_vars' or 'vars' keyword arguments you can query for the approximately
    1000 variables that Compustat tracks, using Compustat's own naming conventions for the
    annual vs. quarterly files.

    Note
    _____
    There is less error checking in this function compared to the other methods in this class
    because of the large number of variables in Compustat.
    """
    # default (standard) variables pulled from the annual vs. quarterly tables
    if(freq == 'A'):
        STD_VARS = ['gvkey', 'datadate', 'tic', 'at', 'sale', 'cogs', 'act', 'txdi', 'csho', 'lct', 'txdc', 'dpc', 'che', 'dlc',
                    'ceq', 'seq', 'teq', 'pstk', 'pstkrv', 'pstkl', 'txditc', 'xint', 'xsga', 'ibc', 'dltt', 'mib', 'ib', 'dp']
    else:
        STD_VARS = ['gvkey', 'datadate', 'tic', 'atq', 'saleq', 'cogsq', 'actq', 'txdiq', 'cshoq', 'lctq', 'txdcy', 'dpcy',
                    'cheq', 'dlcq', 'ceqq', 'seqq', 'teqq', 'pstkq', 'txditcq', 'xintq', 'xsgaq', 'ibcy', 'dlttq', 'mibq',
                    'ibq', 'dpq']

    DEFAULT_DTYPES = {'gvkey': str, 'ticker': str, 'at': float, 'sale': float, 'cogs': float, 'act': float, 'txdi': float,
                      'csho': float, 'lct': float, 'dltt': float, 'mib': float, 'txdc': float, 'dpre': float, 'che': float,
                      'dlc': float, 'ceq': float, 'seq': float, 'teq': float, 'pstk': float, 'txditc': float, 'xint': float,
                      'xsga': float, 'ibc': float, 'ib': float}

    # variables created here rather than read from the database
    CREATED_VARS = ['years_in']

    #############################################################################################################################################
    # Query Validation and Error Checking
    #############################################################################################################################################

    if(freq not in ['Q', 'A']):
        raise Exception('Invlaid frequency given to query_compustat')

    # keywords 'add_vars' and 'vars' cannot be used simultaneously
    if('vars' in kwargs and 'add_vars' in kwargs):
        raise Exception('Keywrod Arguments \'add_vars\' and \'vars\' cannot be used simultaneously')

    # create list of the variables being queried
    query_vars = None
    if('vars' in kwargs):
        query_vars = kwargs['vars']
        # 'gvkey' needs to be in the query vars for merging
        if('gvkey' not in query_vars):
            query_vars.insert(0, 'gvkey')
        # add date if people forgot
        if('datadate' not in query_vars and 'date' not in query_vars):
            query_vars.insert(0, 'datadate')
    else:
        if('add_vars' in kwargs):
            query_vars = STD_VARS + kwargs['add_vars']
        else:
            query_vars = STD_VARS
        if('sub_vars' in kwargs):
            sub_vars = ['tic' if elem == 'ticker' else elem for elem in kwargs['sub_vars']]
            query_vars = [elem for elem in query_vars if elem not in sub_vars]
    # translate user-friendly names into Compustat column names
    query_vars = ['datadate' if elem == 'date' else elem for elem in query_vars]
    query_vars = ['tic' if elem == 'ticker' else elem for elem in query_vars]
    query_vars = ['conm' if elem == 'comnam' else elem for elem in query_vars]
    # 'all_vars' means query every column; signalled downstream by None
    if('all_vars' in kwargs):
        query_vars = None

    indfmts = kwargs['indfmts'] if('indfmts' in kwargs) else ['INDL']     # default: Industrial
    datafmts = kwargs['datafmts'] if('datafmts' in kwargs) else ['STD']   # default: Standard
    popsrcs = kwargs['popsrcs'] if('popsrcs' in kwargs) else ['D']        # default: Domestic
    consols = kwargs['consols'] if('consols' in kwargs) else ['C']        # default: Consolidated

    # both 'id_type' and 'ids' must be given together to query for specific assets
    specific_query = False
    id_type = ''
    ids = []
    if('id_type' in kwargs or 'ids' in kwargs):
        if('id_type' not in kwargs or 'ids' not in kwargs):
            raise Exception('When querying for a specific asset both keyword arguments \'id_type\' and \'ids\' must be specified.')
        if(len(kwargs['ids']) == 0):
            raise Exception('\'ids\' keyowrd argument given an empty list.')
        specific_query = True
        id_type = kwargs['id_type']
        if(id_type == 'ticker'):
            id_type = 'tic'
        ids = kwargs['ids']

    # BUG FIX: under 'all_vars' query_vars is None; the unguarded set difference previously raised a TypeError
    query_vars_DB = None if(query_vars is None) else list(set(query_vars) - set(CREATED_VARS))

    ##############################################################################################################################################
    # Load the raw data
    ##############################################################################################################################################

    # read in raw dataframe from local sql database
    raw_df = pd.read_sql(self._compustat_SQL_query(start_date, end_date, freq, vars = query_vars_DB, indfmt = indfmts,
                                                   datafmt = datafmts, popsrc = popsrcs, consol = consols,
                                                   specific_query = specific_query, id_type = id_type, ids = ids),
                         con = self.sql_engine)

    ##############################################################################################################################################
    # Clean the raw data
    ##############################################################################################################################################

    # rename columns
    raw_df = raw_df.rename(columns = {'tic': 'ticker', 'conm': 'comnam'})
    if(freq == 'Q'):
        # quarterly compustat uses different names; rename the defaults to match COMPA_FUNDA
        # NOTE(review): 'dpcy' (quarterly counterpart of annual 'dpc') is not renamed here — confirm intended
        raw_df = raw_df.rename(columns = {'atq': 'at', 'seqq': 'seq', 'ceqq': 'ceq', 'teqq': 'teq', 'pstkq': 'pstk',
                                          'txdcy': 'txdc', 'txditcq': 'txditc', 'saleq': 'sale', 'cogsq': 'cogs',
                                          'xintq': 'xint', 'xsgaq': 'xsga', 'mibq': 'mib', 'ibcy': 'ibc', 'txdiq': 'txdi',
                                          'dpq': 'dpre', 'cshoq': 'csho', 'adjex': 'adjex_f', 'actq': 'act', 'lctq': 'lct',
                                          'cheq': 'che', 'dlcq': 'dlc', 'dlttq': 'dltt', 'ibq': 'ib'})
    else:
        # annual compustat: rename columns for consistency
        raw_df = raw_df.rename(columns = {'dp': 'dpre'})

    # make date a datetime and align to the end of the year/quarter
    raw_df.datadate = pd.to_datetime(raw_df.datadate, format = '%Y-%m-%d')
    if(freq == 'A'):
        raw_df['year_end'] = raw_df.datadate + YearEnd(0)
    else:
        raw_df['quarter_end'] = raw_df.datadate + QuarterEnd(0)

    # I HATE U S&P
    # BUG FIX: check the dataframe columns rather than query_vars (which is None under 'all_vars')
    if('fyear' in raw_df.columns):
        raw_df.fyear = raw_df.fyear.astype(float)

    # add years in to compustat
    if(query_vars is None or 'years_in' in query_vars):
        raw_df['years_in'] = raw_df.groupby(by = ['gvkey']).cumcount()

    # get vars in the dataframe
    quried_vars = list(set(list(raw_df.columns)) - set(['date']))
    vars_dtypes = {}
    for var in quried_vars:
        if(var in DEFAULT_DTYPES):
            vars_dtypes[var] = DEFAULT_DTYPES[var]

    # convert dtypes
    raw_df = raw_df.fillna(value = np.nan)
    raw_df = raw_df.astype(vars_dtypes)

    # sort just for ease of reading
    raw_df = raw_df.drop_duplicates()
    sorting_dims = ['gvkey', 'year_end'] if(freq == 'A') else ['gvkey', 'quarter_end']
    raw_df = raw_df.sort_values(by = sorting_dims)
    raw_df = raw_df.reset_index(drop = True)

    # return the dataframe
    return(raw_df)
def query_link_table(self) ‑> pandas.core.frame.DataFrame
-
Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities to Compustat companies on permno and gvkey.
Returns
raw_df: pd.DataFrame
The raw dataframe pulled from local WRDS database.
Note
Currently this function only works if a local copy of the WRDS database exists with the CCM Linktable.
Expand source code
def query_link_table(self) -> pd.DataFrame:
    """
    Query the CRSP/Compustat (CCM) Merged Linking Table needed to merge CRSP securities
    to Compustat companies on permno and gvkey.

    Returns
    ________
    raw_df: pd.DataFrame\n
        The raw dataframe pulled from local WRDS database.

    Note
    _____
    Currently this function only works if a local copy of the WRDS database exists
    with the CCM Linktable.
    """
    # keep only "live" link types (L*) that are primary links (P) or primary by Compustat (C)
    link_query = """
                 SELECT gvkey, lpermno as permno, lpermco as permco, linktype, linkprim, linkdt, linkenddt
                 FROM CRSP_CCMXPF_LINKTABLE
                 WHERE substr(linktype, 1, 1) = 'L'
                 AND (linkprim = 'C' or linkprim = 'P')
                 """

    # pull the linking table from the local database
    link_df = pd.read_sql(link_query, con = self.sql_engine)

    # identifier columns come back numeric; squeeze to integers, then store as strings
    for id_col in ('permno', 'permco'):
        link_df[id_col] = pd.to_numeric(link_df[id_col], downcast = 'integer').astype(str)
    link_df.gvkey = link_df.gvkey.astype(str)

    # open-ended links (missing linkenddt) are treated as running through today
    link_df.linkenddt = link_df.linkenddt.fillna(pd.to_datetime('today').date())

    # convert the link window bounds to datetimes
    link_df.linkdt = link_df.linkdt.astype('datetime64[ns]')
    link_df.linkenddt = link_df.linkenddt.astype('datetime64[ns]')

    # return the raw dataframe
    return(link_df)
def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) ‑> pandas.core.frame.DataFrame
-
Query the risk-free rate from the Fama-French library on local WRDS. This rate is equivalent to the 1 month T-Bill rate.
Parameters
start_date: datetime.date
Starting date of the dataset being queried.
end_date: datetime.date
Ending date of the dataset being queried.
obs_freq: str
The observational frequency of the CRSP database being queried. Choices are: * 'D' : daily * 'M' : monthly * 'A' : annually
Returns
raw_df: pd.DataFrame
Risk-free rate data.
Note
The dataframe returned makes adjustments for NYSE holidays during compounding.
Note
List of queried CRSP variables:
* date : Date of observation * rf : Risk-free rate
Expand source code
def query_riskfree(self, start_date: datetime.date, end_date: datetime.date, obs_freq: str) -> pd.DataFrame:
    """
    Query the risk-free rate (equivalent to the 1 month T-Bill rate) from the
    Fama-French library on local WRDS.

    Parameters
    ___________
    start_date: datetime.date\n
        Starting date of the dataset being queried.
    end_date: datetime.date\n
        Ending date of the dataset being queried.
    obs_freq: str\n
        The observational frequency of the CRSP database being queried.
        Choices are:
            * 'D' : daily
            * 'M' : monthly
            * 'A' : annually

    Returns
    ________
    raw_df: pd.DataFrame\n
        Risk-free rate data.

    Note
    _____
    The dataframe returned makes adjustments for NYSE holidays during compounding.

    Note
    _____
    List of queried CRSP variables:\n
        * date : Date of observation
        * rf : Risk-free rate
    """
    # Monthly observations are dated on the 1st of each month, so roll a
    # mid-month 'start_date' back to the 1st; otherwise the query would miss
    # that month's observation.
    if(obs_freq in ['M', 'A']):
        month_start = (start_date + MonthBegin(-1)).date()
        if(start_date != month_start):
            start_date = month_start
    # pull the raw risk-free series from the local database
    raw_df = pd.read_sql(self._rf1m_SQL_query(start_date, end_date, obs_freq), con = self.sql_engine)
    raw_df['date'] = pd.to_datetime(raw_df['date'])
    # shift observation dates to end-of-period for non-daily frequencies
    if(obs_freq == 'M'):
        raw_df['date'] = raw_df['date'] + MonthEnd(0)
    elif(obs_freq == 'A'):
        raw_df['date'] = raw_df['date'] + YearEnd(0)
    return(raw_df)
def raw_sql(self, sql_str)
-
Allows the user to use raw SQL on the underlying database.
Note
This can cause irreversible damage to the underlying database that can only be fixed by deleting and reconstructing the database.
Expand source code
def raw_sql(self, sql_str):
    """
    Allows the user to use raw SQL on the underlying database.

    Note
    _____
    This can cause irreversible damage to the underlying database that can
    only be fixed by deleting and reconstructing the database.
    """
    # warn before executing arbitrary SQL and require explicit confirmation
    cprint.warn('The operation that you are about to perform might damage the local database. Do you wish to continue [y/n]:')
    if(input() != 'y'):
        cprint.info('Operation cancelled.')
        return(None)
    raw_df = pd.read_sql(sql_str, con = self.sql_engine)
    return(raw_df)