asdfree
asdfree copied to clipboard
replace convey example in pnadc
IBGE just published Gini estimates using pnadc microdata. They have used different variables and filters from the used in the pnadc chapter of the book. I suggest to change it in order to match IBGE results. I believe Guilherme knows how to do that.
hi, where is the text/code that needs to change? could you take a first pass at it? thanks
Guilherme has already done it. He told me his results are matching IBGE results
On Thu, Dec 14, 2017 at 11:07 AM, Anthony Damico [email protected] wrote:
hi, where is the text/code that needs to change? could you take a first pass at it? thanks
— You are receiving this because you authored the thread. Reply to this email directly, view it on GitHub https://github.com/ajdamico/asdfree/issues/318#issuecomment-351705888, or mute the thread https://github.com/notifications/unsubscribe-auth/AFD-pz3MgP0HJ0RozKSRY3df6oTdv6z8ks5tAR2VgaJpZM4RCBxv .
Here's the code. It seems to match IBGE's publication.
# attempt to replicate numbers in
# https://biblioteca.ibge.gov.br/visualizacao/livros/liv101390.pdf
library(downloader)
# define directories
output_dir <- file.path( tempdir() , "PNADC Anual")
unlink( output_dir , recursive = TRUE )
# set uf table
uf <-
structure(list(V1 = c(11L, 12L, 13L, 14L, 15L, 16L, 17L, 21L,
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 31L, 32L, 33L, 35L, 41L,
42L, 43L, 50L, 51L, 52L, 53L),
V2 = structure(c(22L, 1L, 4L,
23L, 14L, 3L, 27L, 10L, 18L, 6L, 20L, 15L, 17L, 2L, 26L, 5L,
13L, 8L, 19L, 25L, 16L, 24L, 21L, 12L, 11L, 9L, 7L),
.Label = c("Acre", "Alagoas", "Amapá", "Amazonas", "Bahia", "Ceará", "Distrito Federal",
"Espírito Santo", "Goiás", "Maranhão", "Mato Grosso", "Mato Grosso do Sul",
"Minas Gerais", "Pará", "Paraíba", "Paraná", "Pernambuco", "Piauí",
"Rio de Janeiro", "Rio Grande do Norte", "Rio Grande do Sul",
"Rondônia", "Roraima", "Santa Catarina", "São Paulo", "Sergipe",
"Tocantins"), class = "factor")), .Names = c("uf", "uf_name"),
class = "data.frame", row.names = c(NA, -27L))
uf$cp_name <- c("Porto Velho" , "Rio Branco" , "Manaus", "Boa Vista", "Belém", "Macapá", "Palmas", "São Luís", "Teresina",
"Fortaleza" , "Natal", "João Pessoa" , "Recife" , "Maceió" , "Aracaju", "Salvador", "Belo Horizonte" ,
"Vitória" , "Rio de Janeiro", "São Paulo" , "Curitiba", "Florianópolis" , "Porto Alegre", "Campo Grande",
"Cuiabá", "Goiânia" , "Brasília" )
uf$rg_name <- factor( substr( uf$uf , 1 , 1 ) , levels = 1:5 , labels = c( "Norte" , "Nordeste" , "Sudeste" , "Sul" , "Centro-Oeste" ) )
# load functions
downloader::source_url( "https://raw.githubusercontent.com/guilhermejacob/guilhermejacob.github.io/master/scripts/pnadc_anual.R" , quiet = TRUE , prompt = FALSE )
# create catalog
catalog <- catalog_pnadc_anual( output_dir = output_dir )
# subset to 1st visit of 2016:
catalog <- subset( catalog , year == 2016 & visit == 1 )
# builds dataset
catalog <- build_pnadc_anual( catalog )
catalog
# build deflator table
# download and extract files
tf <- tempfile()
td <- file.path( tempdir() , "unzipped" )
download.file( "http://servicodados.ibge.gov.br/Download/Download.ashx?u=ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Trimestral/Microdados/Documentacao/Deflatores.zip" , tf )
unzip( tf , exdir = td )
list.files( td )
# read excel file
deflat_tab <- readxl::read_xls( file.path( td , "ipca_deflator_ref_mes_08_2017_pnadc.xls" ) , sheet = 2 )
# reorder columns
deflat_tab <- deflat_tab[ , rev( colnames( deflat_tab ) ) ]
# fix names
colnames( deflat_tab ) <- tolower( colnames( deflat_tab ) )
colnames( deflat_tab ) <- gsub( ".* - " , "" , colnames( deflat_tab ) )
colnames( deflat_tab ) <- gsub( ".* " , "" , colnames( deflat_tab ) )
# fix formats
deflat_tab[ , ] <- apply( deflat_tab[ , ] , 2 , as.numeric )
# prices of July/2016
this_time_ref <- deflat_tab$mes == 7 & deflat_tab$ano == 2016
deflat_tab[ , -1:-2 ] <- t( apply( deflat_tab[ , -1:-2 ] , 1 , function( x ) x / as.numeric( deflat_tab[ this_time_ref , -1:-2 ] ) ) )
# define quarter
deflat_tab$trimestre <- ceiling( deflat_tab$mes / 3 )
# subsets middle months in each quarter
deflat_tab <- deflat_tab[ deflat_tab$mes %in% (3*1:4 - 1) , ]
# melt dataset
library(reshape2)
deflat_tab <- melt( deflat_tab , id.vars = c( "ano" , "trimestre" , "mes" ) , variable.name = "deflt_area" , value.name = "deflt" )
# build survey design object
# load packages
library( survey )
library( convey )
options( survey.lonely.psu = "adjust" )
options( survey.multicore = TRUE )
# list files
these_designs <- list.files( output_dir , full.names = TRUE )
these_designs <- these_designs[ grepl( "2016" , these_designs ) ]
this_file <- these_designs[[1]]
# read dataset
pnadc_df <- readRDS( this_file )
# keep necessary column
these_columns <- c( "upa" , "estrato" , "v1031" , "v1030" , "posest" , "ano" , "trimestre" , "uf" , "capital" , "v2010" , "v2009" , "vd4016" , paste0( "vd500" , 1:6 ) , "v3002" , paste0( "vd402" , c(0,2) ) , "vd2002" )
pnadc_df <- pnadc_df[ , colnames( pnadc_df )[ colnames( pnadc_df ) %in% these_columns] ]
# merge data
pnadc_df <- merge( pnadc_df , uf , by = "uf" )
# add one column
pnadc_df$one <- 1
# create region
pnadc_df$regiao <- substr( pnadc_df$uf , 1 , 1 )
# prepare deflator
pnadc_df$deflt_area <- NULL
pnadc_df$deflt_area[ pnadc_df$regiao %in% 1 ] <- "pa"
pnadc_df$deflt_area[ pnadc_df$regiao %in% 2 ] <- "nordeste"
pnadc_df$deflt_area[ pnadc_df$uf %in% 26 ] <- "pe"
pnadc_df$deflt_area[ pnadc_df$uf %in% 29 ] <- "ba"
pnadc_df$deflt_area[ pnadc_df$uf %in% 23 ] <- "ce"
pnadc_df$deflt_area[ pnadc_df$regiao %in% 3 ] <- "sudeste"
pnadc_df$deflt_area[ pnadc_df$uf %in% 31 ] <- "mg"
pnadc_df$deflt_area[ pnadc_df$uf %in% 32 ] <- "es"
pnadc_df$deflt_area[ pnadc_df$uf %in% 33 ] <- "rj"
pnadc_df$deflt_area[ pnadc_df$uf %in% 35 ] <- "sp"
pnadc_df$deflt_area[ pnadc_df$regiao %in% 4 ] <- "sul"
pnadc_df$deflt_area[ pnadc_df$uf %in% 41 ] <- "pr"
pnadc_df$deflt_area[ pnadc_df$uf %in% 43 ] <- "rs"
pnadc_df$deflt_area[ pnadc_df$regiao %in% 5 ] <- "centro-oeste"
pnadc_df$deflt_area[ pnadc_df$uf %in% 50 ] <- "ms"
pnadc_df$deflt_area[ pnadc_df$uf %in% 52 ] <- "go"
pnadc_df$deflt_area[ pnadc_df$uf %in% 53 ] <- "df"
pnadc_df <- merge( pnadc_df , deflat_tab , by = c( "ano" , "trimestre" , "deflt_area" ) , all.x = TRUE , all.y = FALSE )
# deflate incomes
pnadc_df[ , paste0( "def_" , paste0( "vd500" , c(1:2,4:5) ) ) ] <- ( pnadc_df[ , paste0( "vd500" , c(1:2,4:5) ) ] * pnadc_df$deflt )
pnadc_df[ , paste0( "def_" , paste0( "vd402" , c(0,2) ) ) ] <- ( pnadc_df[ , paste0( "vd402" , c(0,2) ) ] * pnadc_df$deflt )
# preliminary survey design
pre_stratified <-
svydesign(
ids = ~ upa ,
strata = ~ estrato ,
weights = ~ v1031 ,
data = pnadc_df ,
nest = TRUE
)
# warning: do not use `pre_stratified` in your analyses!
# you must use the `pnadc_design` object created below.
# post-stratification targets
df_pos <- data.frame( posest = unique( pnadc_df$posest ) , Freq = unique( pnadc_df$v1030 ) )
# final survey design object
pnadc_design <- postStratify( pre_stratified , ~ posest , df_pos )
# remove the `pnadc_df` data.frame object
# and the `pre_stratified` design before stratification
rm( pnadc_df , pre_stratified , df_pos )
# applies convey
pnadc_design <- convey_prep( pnadc_design )
# estimates for per capita income
# estimate national gini index
svygini( ~def_vd5002 , pnadc_design , na.rm = TRUE )
# gini SE
# def_vd5002 0.54902 0.0059
# estimate gini indices per region
svyby( ~def_vd5002 , ~regiao , pnadc_design , svygini , na.rm = TRUE )
# regiao def_vd5002 se
# 1 1 0.5387365 0.006602036
# 2 2 0.5550321 0.006709269
# 3 3 0.5351383 0.010972824
# 4 4 0.4735002 0.005174328
# 5 5 0.5226047 0.006928894
# estimate gini indices per uf
svyby( ~def_vd5002 , ~uf , pnadc_design , svygini , na.rm = TRUE )
# uf def_vd5002 se
# 11 11 0.4779487 0.013039996
# 12 12 0.5755707 0.014961625
# 13 13 0.5718889 0.016406613
# 14 14 0.5473307 0.021664408
# 15 15 0.5311588 0.011627843
# 16 16 0.5597943 0.018165462
# 17 17 0.4981964 0.014794066
# 21 21 0.5283739 0.014989830
# 22 22 0.5455204 0.018890688
# 23 23 0.5534269 0.013670937
# 24 24 0.5575969 0.028177702
# 25 25 0.5404392 0.015861669
# 26 26 0.5783009 0.017090279
# 27 27 0.5263301 0.013028798
# 28 28 0.5716610 0.018696939
# 29 29 0.5484001 0.016016209
# 31 31 0.5036462 0.007886148
# 32 32 0.5128223 0.011484069
# 33 33 0.5242010 0.013585830
# 35 35 0.5412760 0.017064557
# 41 41 0.4853866 0.009914980
# 42 42 0.4286401 0.005914518
# 43 43 0.4858792 0.008168716
# 50 50 0.4809813 0.012016158
# 51 51 0.4569142 0.012605262
# 52 52 0.4739737 0.012410545
# 53 53 0.5830789 0.011242755
# income from all sources
# replicates
# "Índice de Gini do rendimento domiciliar per capita, a preços médios do ano"
# in file:
# ftp://ftp.ibge.gov.br/Trabalho_e_Rendimento/Pesquisa_Nacional_por_Amostra_de_Domicilios_continua/Anual/Rendimento_de_Todas_as_Fontes_2016/PNAD_Continua_2016_Rendimento_de_Todas_as_Fontes.xls
# drops pensionistas, empregados domésticos and parentes do empregado doméstico (vd2002 < 15)
sub_pnadc_design <- subset( pnadc_design , vd2002 < 15 )
# estima gini
svygini( ~def_vd4020 , sub_pnadc_design , na.rm = TRUE )
# gini SE
# def_vd4020 0.52484 0.0081
svyby( ~def_vd4020 , ~regiao , sub_pnadc_design , svygini , na.rm = TRUE )
# regiao def_vd4020 se
# 1 1 0.5170974 0.006884928
# 2 2 0.5447208 0.007090345
# 3 3 0.5196346 0.014887737
# 4 4 0.4647904 0.005158236
# 5 5 0.4930897 0.006776738
svyby( ~def_vd4020 , ~uf_name , sub_pnadc_design , svygini , na.rm = TRUE )
# uf def_vd4020 se
# 11 11 0.4468628 0.011708777
# 12 12 0.5284648 0.017992753
# 13 13 0.5370366 0.018866572
# 14 14 0.5313634 0.026873495
# 15 15 0.5274730 0.010880827
# 16 16 0.4959351 0.020879748
# 17 17 0.4748080 0.014998776
# 21 21 0.5457638 0.015895074
# 22 22 0.5917792 0.018945377
# 23 23 0.5346364 0.015800299
# 24 24 0.5491058 0.033926487
# 25 25 0.5382979 0.017969941
# 26 26 0.5508346 0.020051042
# 27 27 0.4805627 0.014571881
# 28 28 0.5603072 0.022801984
# 29 29 0.5372554 0.014348837
# 31 31 0.4858659 0.007833025
# 32 32 0.4685286 0.011995227
# 33 33 0.4842118 0.021749118
# 35 35 0.5335524 0.021987208
# 41 41 0.4679942 0.008657788
# 42 42 0.4171919 0.006819190
# 43 43 0.4893804 0.008901638
# 50 50 0.4703046 0.014011205
# 51 51 0.4403426 0.012787076
# 52 52 0.4411500 0.011654884
# 53 53 0.5531420 0.011088630
isn't there a more direct way of getting pnadc_df using lodown?
There might be. If we introduce a variable identifying the dataset type, like "special" or "quarterly", it is possible to use the same function. However, new variables such as visit number should also be introduced. What do you think?
this is up to Anthony.
hi, where is this code supposed to go?
On Thu, Dec 14, 2017 at 12:37 PM, DjalmaPessoa [email protected] wrote:
this is up to Anthony.
— You are receiving this because you commented. Reply to this email directly, view it on GitHub https://github.com/ajdamico/asdfree/issues/318#issuecomment-351782841, or mute the thread https://github.com/notifications/unsubscribe-auth/AANO50C2i-TTXDRZr6pZ6k_5ewPM2lqEks5tAVzYgaJpZM4RCBxv .
I see two possibilities:
- Replace the example in the section Poverty and Inequality Estimation with convey;
- Include in the section Replication Example. in either case, I believe you should eliminate the existing svygini example that gives results different from IBGE.