Add 'loop over var names' Stata to Pandas crosswalk

Open adamrossnelson opened this issue 5 years ago • 0 comments

Some example code...

>>> vars
['yr2000', 'yr2001', 'yr2002', 'yr2003', 'yr2004', 'yr2005', 'yr2006', 'yr2007', 'yr2008', 'yr2009', 'yr2010', 'yr2011', 'yr2012', 'yr2013', 'yr2014', 'yr2015', 'yr2016', 'yr2017', 'yr2018', 'yr2019']
>>>

Also

>>> for i in range(2013,2019):
...    df_dict[i] = pd.read_stata('list_of_caids_numsub_{}.dta'.format(i))
...
>>> df_dict[2014].columns
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')

>>> for i in range(2013, 2019):
...    df_dict[i].columns
...
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
Index(['ApplicantId', 'numSub', 'didSub', 'sessyr'], dtype='object')
>>> for i in range(2013, 2019):
...    df_dict[i].columns = [c if c == 'ApplicantId' else c + str(i) for c in df_dict[i].columns]
...
>>> for i in range(2013, 2019):
...    df_dict[i].columns
...
Index(['ApplicantId', 'numSub2013', 'didSub2013', 'sessyr2013'], dtype='object')
Index(['ApplicantId', 'numSub2014', 'didSub2014', 'sessyr2014'], dtype='object')
Index(['ApplicantId', 'numSub2015', 'didSub2015', 'sessyr2015'], dtype='object')
Index(['ApplicantId', 'numSub2016', 'didSub2016', 'sessyr2016'], dtype='object')
Index(['ApplicantId', 'numSub2017', 'didSub2017', 'sessyr2017'], dtype='object')
Index(['ApplicantId', 'numSub2018', 'didSub2018', 'sessyr2018'], dtype='object')
>>> df = df_dict[2013]
>>> for i in range(2014, 2019):
...    df = pd.merge(df, df_dict[i], how='outer')
...
>>> df.columns
Index(['ApplicantId', 'numSub2013', 'didSub2013', 'sessyr2013', 'numSub2014',
       'didSub2014', 'sessyr2014', 'numSub2015', 'didSub2015', 'sessyr2015',
       'numSub2016', 'didSub2016', 'sessyr2016', 'numSub2017', 'didSub2017',
       'sessyr2017', 'numSub2018', 'didSub2018', 'sessyr2018'],
      dtype='object')
>>>

Jan 16 '20 17:01 adamrossnelson