StataQuickReference
StataQuickReference copied to clipboard
Add 'loop over var names' Stata to Pandas crosswalk
Some example code...
>>> vars
['yr2000', 'yr2001', 'yr2002', 'yr2003', 'yr2004', 'yr2005', 'yr2006', 'yr2007', 'yr2008', 'yr2009', 'yr2010', 'yr2011', 'yr2012', 'yr2013', 'yr2014', 'yr2015', 'yr2016', 'yr2017', 'yr2018', 'yr2019']
>>>
Also
>>> for i in range(2013,2019):
... df_dict[i] = pd.read_stata('list_of_caids_numsub_{}.dta'.format(i))
...
>>> df_dict[2014].columns
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
>>> for i in range(2013, 2019):
... df_dict[i].columns
...
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
>>> for i in range(2013, 2019):
... df_dict[i].columns = [c if c == 'ApplicantId' else c + str(i) for c in df_dict[i].columns]
...
>>> for i in range(2013, 2019):
... df_dict[i].columns
...
Index(['ApplicantId', 'numSub2013', 'didSub2013', 'sessyr2013'], dtype='object')
Index(['ApplicantId', 'numSub2014', 'didSub2014', 'sessyr2014'], dtype='object')
Index(['ApplicantId', 'numSub2015', 'didSub2015', 'sessyr2015'], dtype='object')
Index(['ApplicantId', 'numSub2016', 'didSub2016', 'sessyr2016'], dtype='object')
Index(['ApplicantId', 'numSub2017', 'didSub2017', 'sessyr2017'], dtype='object')
Index(['ApplicantId', 'numSub2018', 'didSub2018', 'sessyr2018'], dtype='object')
>>> df = df_dict[2013]
>>> for i in range(2014, 2019):
... df = pd.merge(df, df_dict[i], how='outer')
...
>>> df.columns
Index(['ApplicantId', 'numSub2013', 'didSub2013', 'sessyr2013', 'numSub2014',
'didSub2014', 'sessyr2014', 'numSub2015', 'didSub2015', 'sessyr2015',
'numSub2016', 'didSub2016', 'sessyr2016', 'numSub2017', 'didSub2017',
'sessyr2017', 'numSub2018', 'didSub2018', 'sessyr2018'],
dtype='object')
>>>