This Replication Kit details all important computations carried out for the paper
The data set used here is the German Socio-Economic Panel. We are not allowed to share the data. However, it is available, free of charge, to researchers worldwide, upon entering into a data contract, see details. The full replication code (data generation, all computations) will be deposited in the SOEP Research Data Center (SOEP-RDC), which can be accessed by authorised users with a valid SOEP data contract. |
We replicate Figure 1: Pareto QQ-plots for top wealth: Filling in and
adding tail observation, panel b. To this end, data from Manager Magazin
is appended to SOEP+P (as explained in the main text.) The imposed slope
of .6 used in the graph is obtained in the next section, i.e. invoking
beyondpareto w01110 [w=w_SOEPP], fracrange(.003,.3) rho(-0.5)
.
qui {
keep if hhrf!=. & hhrf>0
//only positive wealth
keep if w01110>0 & w01110!=.
//drop migration samples
drop if inlist(psample,17,18,19)
//exclude observation in SOEP with unreasonably high wealth
//capitalization method revealed the person cannot have more than a million in assets
drop if pid==21566601
//collapse to HH level
collapse (sum) w01110 (mean) hhrf hhrfao hhrfp psample , by(hid)
//add MM
append using "../data/MM2019"
//set MM to HH level
replace w01110=weight_MM*w01110 if MM==1
replace hhrf=1 if MM==1
replace hhrfao=1 if MM==1
//make weights sets for different samples
gen w_SOEP=hhrfao if (psample!=22 & psample!=.)
//take MM, but keep only the top 300 , Status Quo
gen w_SOEPMM=hhrfao if (psample!=22 & psample!=.) | MM==1
//Final dataset, i.e. fully filled in
gen w_all=hhrf
//renormalize weights to sum to N
foreach v of varlist w_SOEP w_SOEPMM w_all {
sum `v'
replace `v'=`v'/r(mean) if `v'!=.
}
}
* generate QQ-plot manually
local samp all
local weight w_all
keep if `weight'!=.
sort w01110, stable
gen K=_N-(_n-1)
gsort - w01110
sum `weight'
gen F=log((r(sum)+1)/(sum(`weight')))
gen double ljnw=F
gen double Xnw=log(w01110)
local thr=log(500000)
sum ljnw if w01110>exp(`thr') & MM!=1
local rngebm=r(min)
local rngebp=r(max)
sum ljnw if w01110>exp(`thr') & MM==1
local rngetm=r(min)
local rngetp=r(max)
sum ljnw if w01110>exp(`thr') & MM==1 & w_SOEPMM!=.
local rngetm=r(min)
local rngetp=r(max)
reg Xnw ljnw if MM==1 & w_SOEPMM!=. & Xnw>=`thr'
local slp2=0.956 //round(_b[ljnw],0.001) for labeling purposes, we directly enter the number
//insert optimal estimate of alpha for fit (Table 1: Top wealth in Germany in 2019, Panel C)
constraint 1 ljnw = 1/1.665
cnsreg Xnw ljnw if Xnw>=`thr' & MM!=1 , constraints(1)
gen fit=.
sum ljnw if w01110>exp(`thr') & psample==22
replace fit=(_b[_cons])+(1/1.665)*(ljnw) if Xnw>=`thr' & ljnw<=r(max)
local cnt=1
foreach q of numlist 1000000 10000000 100000000 1000000000 10000000000 100000000000{
global lab`cnt'=log(`q')
local cnt=1+`cnt'
}
global lab0=log(500000)
local titl "SOEP+SOEP-P+MM"
//full picture
tw scatter Xnw ljnw if psample==22 & Xnw>=`thr', mcolor(red) msym(oh) msize(medlarge) || ///
scatter Xnw ljnw if w_SOEP!=. & Xnw>=`thr', mcolor(black) msym(o) msize(vsmall) || ///
scatter Xnw ljnw if MM==1 & Xnw>=`thr', mcolor(lavender) msym(o) msize(vsmall) || ///
lfit fit ljnw if Xnw>=`thr', lcolor(red) || ///
lfit Xnw ljnw if Xnw>=`thr' & MM==1 , lcolor(lavender) ///
scheme(s2mono) xtitle("Relative Rank") ytitle("Net Wealth") ///
ylabel(${lab1} "1M" ${lab2} "10M" ${lab3} "100M" ${lab4} "1B" ${lab5} "10B" ${lab6} "100B") ///
legend(order(2 "SOEP" 1 "SOEP-P" 3 "MM") ring(0) pos(5) row(3)) ///
text(${lab2} 3.5 "slope= 0.601") ///
text(${lab6} 19 "slope= `slp2'") ///
graphregion(color(white)) xsize(7)
Web Appendix B, entitled Copulas and the dependence of wealth and income, considers the goodness of fit of the Gumbel copula. Section 3.1 of the main text, entitled How much do wealth and income overlap? draws on these numbers. Our implementation is in .
library(tidyverse)
library(copula)
library(latticeExtra)
library(gridExtra)
library(ggridges) ## for: geom_density_ridges
The data input can, as explained, not be shared with the user. We
consider here explicitly the case of market income
(hhmkt_inc
) and wealth (w01110
), the data
input is called soep
.
glimpse(soep)
Rows: 16,276
Columns: 6
$ w01110 <dbl> -200.00, 1512681.25, 4000.00, 221000.00, 60188.38, 4400.00, …
$ hhmkt_inc <dbl> 17914, 82514, 18096, 45935, 32000, 30639, 52408, 66490, 3240…
$ hhpst_inc <dbl> 14600, 54716, 25688, 35849, 30411, 31867, 35331, 44592, 2932…
$ hhlab_inc <dbl> 0, 79200, 8400, 0, 24000, 0, 52200, 66300, 0, 19525, 12963, …
$ hhcap_inc <dbl> 10, 4581, 0, 9989, 1874, 159, 2221, 190, 0, 1, 5, 165, 71, 1…
$ wgt <dbl> 7509.39, 453.94, 4326.58, 1419.27, 954.66, 1304.49, 5178.60,…
sel <- soep$hhmkt_inc
yl <- 'Market Income'
fn <- 'mktinc'
dat <-cbind(soep$w01110,sel,soep$wgt)
U <- matrix(NA, dim(dat)[1],2)
U[,1] <- spatstat.univar::ewcdf(dat[,1],dat[,3])(dat[,1])
U[,2] <- spatstat.univar::ewcdf(dat[,2],dat[,3])(dat[,2])
Here we reproduce Figure B.1: Contour plot of the empirical and Gumbel copula, and Pickands tail dependence function.
## the empirical copula
u <- seq(0, 1, length.out = 16)
grid <- as.matrix(expand.grid(u1 = u, u2 = u))
val <- cbind(grid, z = C.n(grid, X = U))
## the fitted copula
fg <- fitCopula(gumbelCopula(), data = U, method="irho")
gc <- gumbelCopula(iRho(gumbelCopula(), rho = rho(gumbelCopula(fg@estimate))))
gc
Gumbel copula, dim. d = 2
Dimension: 2
Parameters:
alpha = 1.715611
The following numbers constitute row 1 of Table B.1: Wealth and income: Dependence analysis.
cat("rho: ", rho(gumbelCopula(fg@estimate)), ", theta: ", fg@estimate, ", lambda_u: ", lambda(gc)[2], "\n")
rho: 0.5830379 , theta: 1.715559 , lambda_u: 0.5021608
\(\rho\) is Spearman’s empirical rank correlation coefficient between wealth and income. The estimate of the Gumbel copula parameter \(\theta\) is obtained by inverting the theoretical mapping \(\rho(\theta)\). \(\lambda_u\) is the upper-tail dependence measure, \(\lambda_u = lim_{q \to 1} \Pr\{W > F_W ^{-1}(q) | Y > F_Y ^{-1}(q) \}\) which in the Gumbel case is simply \(2-2^{1/ \theta}\).
## contour CDF (Figure B.1 panel 1)
cpCn <- contourplot2(val, region = FALSE, labels = FALSE, xlab="Wealth",ylab=yl,
col = "blue", lty=2, lwd=2,
cuts=9, xlim = c(0.25,1), ylim= c(0.25,1))
cpgC_MM <- contourplot2(gc, FUN = pCopula, lwd = 1, region = FALSE,col="red")
plot(cpCn+cpgC_MM)
This is the contour plot of the joint distribution of wealth and market income, using the empirical copula (blue dashed line) and the method-of-moments fitted Gumbel copula (red dashed line).
## Pickand (Figure B.1 panel 2)
blub <- plot(1,xlim = c(0, 1), ylim = c(0.5, 1),
type = "n", xlab = "t", ylab ="A")
curve(A(gc, w = x), from = 0, to = 1, ylim = c(0.5 ,1),
col = 1, lwd = 2, add=TRUE)
lines(c(0, 1), c(1, 1), lty = 2)
lines(c(0, 0.5, 1), c(1, 0.5, 1), lty = 2)
curve(An.biv(U, x), col="red", add=TRUE)
This is the plot of Pickand’s dependence function, using an empirical estimate (red line) and the Gumbel model implied function \(A_\theta = (t^\theta + (1-t)^\theta)^{1/\theta}\) for \((t\in [0,1])\).
Here we reproduce Figure B.3: Survival copula for capital and market income, panel b.
#surviv cop
psh <- paste("psh", fn,sep="")
assign(psh, pCopula(cbind((1:9)/10,(1:9)/10), rotCopula(gc)))
#contour option
levels <- pCopula(cbind((1:9)/10,(1:9)/10), rotCopula(gc))
p1 <- contourplot2(rotCopula(gc), FUN = pCopula, lwd = 1, region = FALSE, add = TRUE,
col="black", lty=1,
ylab="Top Wealth Percentiles",xlab="Top Income Percentiles",main="",
at=round(levels, digits=3)) ## true survival CDF
x <- p1$panel.args.common$x[1:26]
y <- p1$panel.args.common$x[1:26]
z <- matrix(p1$panel.args.common$z,length(x),length(y), byrow=TRUE)
contour(x = x, y = y, z = z, levels=round(levels, digits=3),
ylab="Top Wealth Percentiles",xlab="Top Income Percentiles", main="")
abline(coef=c(0,1), col="grey", lty=2)
Here we reproduce Figure B.5: Ridge plots for wealth and income, panel b.
Rankdat <- tibble(expand.grid(u1=seq(0.1,1,.010),u2=seq(0.1,.9,.10)))
Rankdat$dens <- dCopula(cbind(Rankdat$u1,Rankdat$u2), gc)
ggplot(Rankdat, aes(x = u1, y=factor(u2), height = dens)) +
geom_density_ridges(stat = "identity") +
labs(x="Income Percentile Rank", y="Wealth Decile",title = "") +
theme_classic() +
theme(panel.background = element_rect(fill = NA),
panel.grid.major.y = element_line(colour = "grey50"))
Finally, we following code replicates Table 3: Wealth and income: The joint top shares, row one for market income (MktInc) and relating to the population shares and income. The method of calculation is explained in Web Appendix B.2. Wealth and income: Shares for the joint distribution.
library(cubature)
sel <- soep$hhmkt_inc
# use results reported in Table 2
gam <- 1/2.771766
ymin <- 82339 # minimum income in pareto tail
pmin <- .8501482 # lowest percentile after which the top income tail starts
dat <- cbind(soep$w01110,sel,soep$wgt)
est_res <- matrix(NA, 1,4)
rownames(est_res) <- c("MktInc")
i <- 1
# pop shares
integrand1 <- function(x) {dCopula(c(x[1],x[2]),gc)}
pop1 <- hcubature(integrand1,lowerLimit = c(0.9,0.9), upperLimit = c(1,1))
pop2 <- hcubature(integrand1,lowerLimit = c(0.99,0.99), upperLimit = c(1,1))
est_res[i,1] = 100*(pop1$integral)
est_res[i,2] = 100*(pop2$integral)
## compute the cond. expected value for income conditional on wealth and income
integrand2 <- function(x) {dCopula(c(x[1],x[2]),gc)*ymin*((1-x[2])/(1-pmin))^(-gam)}
exv2 <- hcubature(integrand2,lowerLimit = c(0.9,0.9), upperLimit = c(1,1))
exv3 <- hcubature(integrand2,lowerLimit = c(0.99,0.99), upperLimit = c(1,1))
exv4 <- hcubature(integrand2,lowerLimit = c(0,0), upperLimit = c(1,1))
est_res[i,3] = 100*(exv2$integral/exv4$integral)
est_res[i,4] = 100*(exv3$integral/exv4$integral)
print(est_res, digits=3)
[,1] [,2] [,3] [,4]
MktInc 5.4 0.506 14.3 3.12
We proceed to reproduce Table 4: The rich: Descriptors. The
implementation is in stata
.
Since the table is composed of several sections, it is generated and stored as latex snippets section-by-section and row-wise for each group, before being assembled as a full latex table. The generation of the first panel (demographics) is as follows:
mat R=J(6,9,.)
matrix rownames R = "Age" "Female" "SchoolYrs" "Married" "NumChildren" "East\_Soc"
local cnt=1
qui foreach var of varlist age fem yos married nchild east_soc {
preserve
if "`var'"=="`hrs'" drop if hrs==.
/* means for bottom 90% */
mean `var' [aw=round(hhrf)] if w_grps==1
mat R[`cnt',7]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==1
mat R[`cnt',8]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==1
mat R[`cnt',9]=e(b)[1,1]
/* means for top 10-1% */
mean `var' [aw=round(hhrf)] if w_grps==2
mat R[`cnt',4]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==2
mat R[`cnt',5]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==2
mat R[`cnt',6]=e(b)[1,1]
/* means for top 1% */
mean `var' [aw=round(hhrf)] if w_grps>=3
mat R[`cnt',1]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps>=3
mat R[`cnt',2]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps>=3
mat R[`cnt',3]=e(b)[1,1]
local cnt=1+`cnt'
restore
}
/* estout matrix(R, fmt(2 2 2 2 2 2 2 2 2)) using "tables\tab7a.tex", style(tex) replace collabels(none) mlabels("",none) */
esttab matrix(R, fmt(2 2 2 2 2 2 2 2 2)), mlabels("",none) modelwidth(7)
c1 c2 c3 c4 c5 c6 c7 c8 c9
------------------------------------------------------------------------------------
Age 60.15 54.03 55.75 62.95 51.54 55.43 56.19 57.42 56.90
Female 0.26 0.31 0.22 0.36 0.39 0.28 0.53 0.53 0.52
SchoolYrs 14.33 15.40 15.02 13.82 14.60 14.81 12.29 12.20 12.36
Married 0.67 0.82 0.74 0.67 0.79 0.82 0.43 0.42 0.45
NumChildren 0.40 0.59 0.49 0.30 0.56 0.56 0.31 0.28 0.30
East\_Soc 0.03 0.09 0.03 0.06 0.11 0.07 0.20 0.20 0.19
------------------------------------------------------------------------------------
The other panels of the table follow a similar generative scheme.
//tabseg for Inheritance
mat R=J(4,9,.)
local cnt=1
gen heir=0
replace heir=1 if (I1k_firm>0 & !mi(I1k_firm) | I1k_other>0 & !mi(I1k_other))
gen heir_firm=0
replace heir_firm=1 if (I1k_firm>0 & !mi(I1k_firm))
qui foreach var of varlist heir heir_firm I1k_firm I1k_other {
preserve
if (`var'==I1k_firm|`var'==I1k_other) keep if heir==1
//means for bottom 90%
mean `var' [aw=round(hhrf)] if w_grps==1
mat R[`cnt',7]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==1
mat R[`cnt',8]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==1
mat R[`cnt',9]=e(b)[1,1]
//means for top 10-1%
mean `var' [aw=round(hhrf)] if w_grps==2
mat R[`cnt',4]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==2
mat R[`cnt',5]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==2
mat R[`cnt',6]=e(b)[1,1]
//means for top 1%
mean `var' [aw=round(hhrf)] if w_grps>=3
mat R[`cnt',1]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps>=3
mat R[`cnt',2]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps>=3
mat R[`cnt',3]=e(b)[1,1]
local cnt=1+`cnt'
restore
}
matrix rownames R = "Heir" "Heir\_firm" "I1k\_firm" "I1k\_other"
estout matrix(R, fmt(2 2 2 2 2 2 2 2 2)) using "tables\tab7b.tex", style(tex) replace collabels(none) mlabels("",none)
//tabseg for labor market
mat R=J(8,9,.)
local cnt=1
qui foreach var of varlist selfempl top_manag civserv pgexpft pgexppt pgexpue hhlab_inc hhcap_inc {
preserve
//means for bottom 90%
mean `var' [aw=round(hhrf)] if w_grps==1
mat R[`cnt',7]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==1
mat R[`cnt',8]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==1
mat R[`cnt',9]=e(b)[1,1]
//means for top 10-1%
mean `var' [aw=round(hhrf)] if w_grps==2
mat R[`cnt',4]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==2
mat R[`cnt',5]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==2
mat R[`cnt',6]=e(b)[1,1]
//means for top 1%
mean `var' [aw=round(hhrf)] if w_grps>=3
mat R[`cnt',1]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps>=3
mat R[`cnt',2]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps>=3
mat R[`cnt',3]=e(b)[1,1]
local cnt=1+`cnt'
restore
}
matrix rownames R = "SelfEmp" "TopManag" "CivServ" "ExpFT" "ExpPT" "ExpUE" "LabInc" "CapInc"
estout matrix(R, fmt(2 2 2 2 2 2 2 2 2)) using "tables\tab7c.tex", style(tex) replace collabels(none) mlabels("",none)
//tabseg for personality
mat R=J(6,9,.)
local cnt=1
qui foreach var of varlist risk_tol zopentopw zconstopw zextratopw zagretopw zneurotopw {
preserve
//means for bottom 90%
mean `var' [aw=round(hhrf)] if w_grps==1
mat R[`cnt',7]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==1
mat R[`cnt',8]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==1
mat R[`cnt',9]=e(b)[1,1]
//means for top 10-1%
mean `var' [aw=round(hhrf)] if w_grps==2
mat R[`cnt',4]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps==2
mat R[`cnt',5]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps==2
mat R[`cnt',6]=e(b)[1,1]
//means for top 1%
mean `var' [aw=round(hhrf)] if w_grps>=3
mat R[`cnt',1]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if y_grps>=3
mat R[`cnt',2]=e(b)[1,1]
mean `var' [aw=round(hhrf)] if yw_grps>=3
mat R[`cnt',3]=e(b)[1,1]
local cnt=1+`cnt'
restore
}
matrix rownames R = "Risk\_Tol" "B5\_Open" "B5\_Cons" "B5\_Extra" "B5\_Agree" "B5\_Neuro"
estout matrix(R, fmt(2 2 2 2 2 2 2 2 2)) using "tables\tab7d.tex", style(tex) replace collabels(none) mlabels("",none)
//tabseg for N
mat R=J(1,9,.)
local cnt=1
foreach var of varlist age {
preserve
//means for bottom 90%
mean `var' if w_grps==1
mat R[`cnt',7]=e(_N)[1,1]
mean `var' if y_grps==1
mat R[`cnt',8]=e(_N)[1,1]
mean `var' if yw_grps==1
mat R[`cnt',9]=e(_N)[1,1]
//means for top 10-1%
mean `var' if w_grps==2
mat R[`cnt',4]=e(_N)[1,1]
mean `var' if y_grps==2
mat R[`cnt',5]=e(_N)[1,1]
mean `var' if yw_grps==2
mat R[`cnt',6]=e(_N)[1,1]
//means for top 1%
mean `var' if w_grps>=3
mat R[`cnt',1]=e(_N)[1,1]
mean `var' if y_grps>=3
mat R[`cnt',2]=e(_N)[1,1]
mean `var' if yw_grps>=3
mat R[`cnt',3]=e(_N)[1,1]
local cnt=1+`cnt'
restore
}
matrix rownames R = "\emph{N}"
estout matrix(R, fmt(0)) using "tables\tab7e.tex", style(tex) replace collabels(none) mlabels("",none)
c1 c2 c3 c4 c5 c6 c7 c8 c9
------------------------------------------------------------------------------------
Heir 0.48 0.44 0.40 0.43 0.27 0.41 0.19 0.21 0.21
Heir\_firm 0.21 0.11 0.30 0.04 0.04 0.06 0.01 0.01 0.01
I1k\_firm 719.49 606.52 1945.80 18.14 35.97 62.90 1.28 5.04 5.02
I1k\_other 717.64 391.99 726.48 254.81 183.46 307.31 96.99 127.89 124.60
------------------------------------------------------------------------------------
Our implementation is in .
Our new wrangled data set for this section, entitled
soep
, is as follows
glimpse(soep)
Rows: 15,210
Columns: 24
$ d_yt1 <fct> Not Top, Not Top, Not Top, Not Top, Not Top, Not Top, Not …
$ d_yt10 <fct> Not Top, Not Top, Not Top, Not Top, Not Top, Not Top, Not …
$ d_wt1 <fct> Not Top, Not Top, Not Top, Not Top, Not Top, Not Top, Not …
$ d_wt10 <fct> Top, Not Top, Not Top, Not Top, Not Top, Not Top, Not Top,…
$ d_ywt1 <fct> Not Top, Not Top, Not Top, Not Top, Not Top, Not Top, Not …
$ d_ywt10 <fct> Not Top, Not Top, Not Top, Not Top, Not Top, Not Top, Not …
$ Female <fct> 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0…
$ Age <dbl> 61, 79, 73, 64, 85, 56, 59, 81, 57, 60, 66, 82, 83, 71, 71…
$ SelfEmp <fct> self-emp, not self-emp, not self-emp, not self-emp, not se…
$ SchoolYrs <dbl> 18.0, 10.5, 18.0, 11.0, 11.0, 18.0, 13.0, 10.5, 10.0, 9.0,…
$ I1k_firm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ I1k_other <dbl> 512.502991, 10.475771, 321.613129, 16.066008, 0.000000, 0.…
$ ExpFT <dbl+lbl> 30.667, 36.417, 37.583, 18.167, 45.833, 32.000, 6.667…
$ ExpUE <dbl+lbl> 0.000, 2.250, 2.000, 5.500, 4.000, 0.083, 0.000…
$ ExpPT <dbl+lbl> 0.000, 0.000, 0.500, 0.000, 0.500, 7.917, 16.083…
$ east_soc <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ NumChildren <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ Married <fct> 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1…
$ B5_Open <dbl> -0.49369127, -1.59756613, -0.49369127, -1.04562879, 0.3342…
$ B5_Cons <dbl> -0.4858003, -0.1288104, -0.8427907, 0.2281801, -1.9137614,…
$ B5_Extra <dbl> -0.79573333, -1.08445716, -0.50700933, 0.07043821, -1.0844…
$ B5_Agree <dbl> -0.08343563, -2.13356113, 0.25825158, 0.59993929, 0.599939…
$ B5_Neuro <dbl> 0.85616058, 1.90400827, -0.19168693, -0.19168693, -0.97757…
$ Risk_Tol <dbl+lbl> 2, 5, 7, 5, 6, 2, 3, 6, 5, 2, 2, 3, 5, 7, 4, 4, 1, 6, …
d_wt1
indicates whether an observation is part of the
top 1% of wealth, similarly d_yt1
relates to the top1% for
market income, and d_ywt1
refers to the joint top 1%. The
predictors are combined in the following formula
modf
y ~ Female + Age + SelfEmp + SchoolYrs + I1k_firm + I1k_other +
ExpFT + ExpUE + ExpPT + east_soc + NumChildren + Married +
B5_Open + B5_Cons + B5_Extra + B5_Agree + B5_Neuro + Risk_Tol
The code below is applied to being in the top 1% wealth group
(d_wt1
):
soep$y <- soep$d_wt1
The following code reproduces Figure 2 panel b, the pruned classification tree for being in the top1% group of wealth.
library(rpart)
library(rpart.plot)
tree_wt1 <- rpart(modf, data = soep, method="class",
control = rpart.control(cp = 0,xval=10,minsplit = 50))
tree_wt1 <- prune(tree_wt1, cp=tree_wt1$cptable[3,"CP"])
rpart.plot(tree_wt1,
type = 4,
clip.right.labs = FALSE,
extra = 104,
fallen.leaves = TRUE,
box.palette = "GyRd",
cex.main = 1.5,
compress = FALSE,
ycompress = FALSE,
branch.lwd = 4
)
The random forest is estimated using ranger
, and the
\(m_{try}\) tuning parameter is set
using 10-fold cross-validation using the AUC metric.
library(rsample)
library(ranger)
cores <- parallel::detectCores()
rf_model <- function(splits,mtry, modf) {
rf.fit <- ranger(modf, data=analysis(splits), mtry=mtry,
num.threads = cores,
importance="impurity_corrected", probability = TRUE,
classification =TRUE )
holdout <- assessment(splits)
temp <- predict(rf.fit, holdout, type = "response")
holdout$probs <- temp$predictions[,2]
holdout$class <- ifelse(holdout$probs > 0.5, 1, 0)
holdout$class <- factor(holdout$class,levels=c(0,1),labels = c("Not Top","Top"))
holdout <- holdout %>% select(y, probs, class)
holdout
}
## set.seed(12345)
set.seed(9738)
splits <- initial_split(soep, strata = y)
train_data <- training(splits)
test_data <- testing(splits)
folds <- vfold_cv(train_data, v = 10, strata = y)
mtry_seq <- 2:10
results <- matrix(NA, length(mtry_seq),3,
dimnames=list(NULL, c("mtry", "accuracy", "auc")))
for (i in 1:length(mtry_seq)) {
results[i,1] <- mtry_seq[i]
folds$results <- purrr::map(folds$splits,rf_model,mtry=mtry_seq[i], modf=modf)
folds$accuracy <- purrr::map_dbl(folds$results,
function(x) mean(x$y == x$class))
folds$AUC <- purrr::map_dbl(folds$results,
function(x) yardstick::roc_auc(x,truth = y,probs,
event_level = "second")$.estimate)
results[i,2] <- mean(folds$accuracy)
results[i,3] <- mean(folds$AUC)
}
results <- as_tibble(results)
best_tune_wt1 <- results[which.max(results$auc),]
best_tune_wt1
# A tibble: 1 × 3
mtry accuracy auc
<dbl> <dbl> <dbl>
1 2 0.957 0.897
The optimal model is re-estimated on the complete data set.
## re-estimate with best tuning paramter, and return also variable importance scores
rf.fit_wt1 <- ranger(modf, data=soep, mtry=best_tune_wt1$mtry, num.threads = cores,
importance="impurity_corrected",probability = TRUE,classification =TRUE)
The following code replicates Figure 4: Key predictors: Variable importance scores (VIMPs) for top 1 percent group membership, panel 1. The resulting figure is not identical to the published figure since the seeds for the randomisations are different, but the two a close enough.
vi_RF_wt1 <- tibble(variable=names(rf.fit_wt1 $variable.importance),
score = rf.fit_wt1 $variable.importance) %>%
arrange(desc(score)) %>%
mutate(variable=factor(variable, levels=variable[order(score)]))
n_vi <- 6 # number of variables plotted in VIPs
ggplot(vi_RF_wt1[1:n_vi,], aes(x=variable, y=score )) +
geom_bar(stat = "identity",width=0.6) +
coord_flip() +
theme_bw() +
theme(aspect.ratio = 0.7/1) +
ylab("Importance Score") +
xlab("") +
theme(axis.line = element_line(colour = "black"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(size = 13,angle = 45))
# Forest was grown with 'impurity_corrected' variable importance.
# For prediction it is advised to grow another forest without this importance setting.
# rf.fit_wt1 <- ranger(modf, data=soep, mtry=best_tune_wt1$mtry,
# num.threads = cores,
# probability = TRUE,classification =TRUE)
# ROC and AUC
temp.rf <- predict(rf.fit_wt1, soep, type = "response")
soep$pr_rf <- temp.rf$predictions[,2]
roc_rf_wt1 <- yardstick::roc_curve(soep,y,pr_rf, event_level = "second") %>% mutate(model = "Random Forest")
auc_rf_wt1 <- round(yardstick::roc_auc(soep,truth = y,pr_rf, event_level = "second")$.estimate,digits=3)
auc_rf_wt1
[1] 0.947
glm_wt1 <- glm(modf, data = soep, family = "binomial")
# back-trim the empirical model in a stepwise procedure
# this takes a few seconds ...
glm_wt1_backward <- step(glm_wt1, direction = "backward", trace = 0)
# ROC and AUC
temp.glm <- predict(glm_wt1, soep, type = "response")
soep$pr_glm <- temp.glm
roc_glm_wt1 <- yardstick::roc_curve(soep,y,pr_glm, event_level = "second") %>% mutate(model = "Logit")
auc_glm_wt1 <- round(yardstick::roc_auc(soep,truth = y,pr_glm, event_level = "second")$.estimate,digits=3)
auc_glm_wt1
[1] 0.901
The following replicates Web Appendix Figure G.2: Logits: Top variable importance scores, panel 4.
caret::varImp(glm_wt1_backward) %>%
arrange(desc(Overall)) ->
temp
vi_glm_wt1 <- tibble(variable=rownames(temp),
score = temp$Overall) %>%
mutate(variable=factor(variable, levels=variable[order(score)]))
ggplot(vi_glm_wt1[1:n_vi,], aes(x=variable, y=score )) +
geom_bar(stat = "identity") +
coord_flip() +
theme_bw() +
ylab("Importance Score") + xlab("") +
theme(axis.line = element_line(colour = "black"),
panel.grid.major.y = element_blank(),
panel.grid.minor.y = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.y = element_text(size = 13,angle = 45))
The following replicates Figure 3: ROC curve for top 1 percent of wealth (RFs vs. logits), modulo the randomness.
bind_rows(roc_rf_wt1, roc_glm_wt1) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
theme_bw() +
scale_colour_manual(name="Models",values=c("Random Forest"="red","Logit"="blue")) +
theme(legend.position = "top") +
annotate("text", x = 0.80, y=0.1, label = paste("AUC for RF:", auc_rf_wt1)) +
annotate("text", x = 0.80, y=0.05, label = paste("AUC for Logit:", auc_glm_wt1))
The replication of Table 5: A confusion matrices for top 1 percent wealth group is as follows:
soep$d_check <- as.factor(ifelse(soep$d_wt1 =="Top",1,0))
temp.glm <- predict(glm_wt1, soep, type = "response")
soep$pr_glm <- as.factor(ifelse(temp.glm>=0.24,1,0))
CM_GLM<-caret::confusionMatrix(soep$d_check,soep$pr_glm)$table
CM_GLM<-round(100*CM_GLM/sum(!is.na(soep$d_check)),2)
CM_GLM
Reference
Prediction 0 1
0 92.93 2.87
1 2.47 1.74
temp.rf <- predict(rf.fit_wt1, soep, type = "response")
Warning in predict.ranger(rf.fit_wt1, soep, type = "response"): Forest was
grown with 'impurity_corrected' variable importance. For prediction it is
advised to grow another forest without this importance setting.
soep$pr_rf <- as.factor(ifelse(temp.rf$predictions[,2]>=0.24,1,0))
CM_RF <- caret::confusionMatrix(soep$d_check,soep$pr_rf)$table
CM_RF <- round(100*CM_RF/sum(!is.na(soep$d_check)),2)
CM_RF
Reference
Prediction 0 1
0 94.77 1.02
1 2.56 1.64
Again, the RF results are not identical because of the different seed for the randomisations.
These are considered in Web Appendix E.1.1. Partial Dependence Plots (PDPs). Here we reproduce Figure E.1: Partial dependence plots for top 1 percent: Inheritances and self-employment for the top 1% wealth group. since the plot is based on the RF estimate, the plot here differs slightly from the published version.
library(pdp)
temp <- soep %>% select(all_of(c(V1set,V2set,V3set))) %>% lapply(as.numeric) %>% data.frame
temp$SelfEmp <- factor(temp$SelfEmp,levels=c(1,2),labels = c("not self-emp","self-emp"))
#prediction function
predf <- function(object,newdata) {
mean(predict(object, data=newdata)$predictions[,2])
}
# this takes 2 minutes
wpdp <- pdp::partial(rf.fit_wt1, pred.var = c("SelfEmp","I1k_firm"), pred.fun=predf,
plot = FALSE,grid.resolution = 500,parallel = T)
wSE <- data.frame(wpdp) %>%
filter(SelfEmp=="self-emp") %>%
mutate(yhat=yhat-mean(as.numeric(soep$d_wt1)-1))
wNSE <- as.data.frame(wpdp) %>%
filter(SelfEmp=="not self-emp") %>%
mutate(yhat=yhat-mean(as.numeric(soep$d_wt1)-1))
pdp.se <- ggplot(data.frame(wSE), aes(y=yhat,x=I1k_firm)) +
geom_line(aes(color="W"), lwd=1, lty=6) +
coord_cartesian(ylim = c(0, .4), xlim = c(0,6000)) +
geom_vline(xintercept = 2500,lwd=1,color="black",alpha=0.3, lty=2) +
annotate(geom="text",x=2650, label="2.5M", y=0.05, color="black", angle=270, size=6) +
xlab("Intergen. Transfer in 1000: Firm") + ylab("Change in Predicted Probability") +
ggtitle("Top 1%") +
theme_bw() + theme(text=element_text(size=17)) + theme(legend.position="top") +
# geom_line(data=data.frame(ySE), aes(color="I"), lty=1, lwd=1) +
# geom_line(data=data.frame(ywSE), aes(color="W+I"), lty=5, lwd=1) +
scale_color_manual(name='Rich groups:',
breaks=c('W', 'I', 'W+I'),
values=c('W'='red', 'I'='black', 'W+I'='darkorange')) +
theme(axis.line = element_line(colour = "black"),panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.border = element_blank(),panel.background = element_blank()) +
scale_y_continuous(breaks=seq(0,.4,.05))
pdp.nse <- ggplot(data.frame(wNSE), aes(y=yhat,x=I1k_firm)) +
geom_line(aes(color="W"), lwd=1, lty=6) +
coord_cartesian(ylim = c(0, .4), xlim = c(0,6000)) +
geom_vline(xintercept = 2500,lwd=1,color="black",alpha=0.3, lty=2) +
annotate(geom="text",x=2650, label="2.5M", y=0.05, color="black", angle=270, size=6) +
xlab("Intergen. Transfer in 1000: Firm") + ylab("Change in Predicted Probability") +
ggtitle("Top 1%") +
theme_bw() + theme(text=element_text(size=17)) + theme(legend.position="top") +
# geom_line(data=data.frame(yNSE), aes(color="I"), lty=1, lwd=1) +
# geom_line(data=data.frame(ywNSE), aes(color="W+I"), lty=5, lwd=1) +
scale_color_manual(name='Rich groups:',
breaks=c('W', 'I', 'W+I'),
values=c('W'='red', 'I'='black', 'W+I'='darkorange')) +
theme(axis.line = element_line(colour = "black"),panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.border = element_blank(),panel.background = element_blank()) +
scale_y_continuous(breaks=seq(0,.4,.05))
gridExtra::grid.arrange(pdp.se + ggtitle("Self-Employed"), pdp.nse + ggtitle("Not Self-Employed"),
nrow=1)
These are considered in Web Appendix E.1.2. Accumulated Local Effects (ALEs). Here we reproduce Figure E.3: Accumulated local effects for top 1 percent: Inheritances for the top 1% wealth group. Since the plot is based on the RF estimate, the plot here differs slightly from the published version.
library(ALEPlot)
predf <- function(X.model, newdata) {
predict(X.model, data=newdata)$predictions[,2]
}
modf <- formula(paste0("d_wt1 ~ ", paste(V1set, collapse="+"), "+",
paste(V2set, collapse="+"), "+",
paste(V3set, collapse="+") )
)
rf.fit_wt1 <- ranger(modf, data=soep, mtry=best_tune_wt1$mtry,
num.threads = cores,
# importance="impurity_corrected",
probability = TRUE,classification =TRUE)
temp <- soep %>% select(all_of(c(V1set,V2set,V3set)))
# ALE needs a data.frame
ALE.I1kfirm.wt1 <- ALEPlot(X = as.data.frame(temp), X.model = rf.fit_wt1,
J = grep("I1k_firm", c(V1set,V2set,V3set)),
pred.fun = predf, K=4000)
ggplot(data.frame(ALE.I1kfirm.wt1), aes(y=f.values,x=x.values)) +
geom_line(aes(color="W"), lwd=1, lty=6) +
coord_cartesian(ylim = c(0, .5), xlim = c(0,6000)) +
geom_vline(xintercept = 2500,lwd=1,color="black",alpha=0.3, lty=2) +
annotate(geom="text",x=2650, label="2.5M", y=0.05, color="black", angle=270, size=6) +
xlab("Intergen. Transfer in 1000: Firm") + ylab("Accumulated Local Effect") +
ggtitle("Top 1%") +
theme_bw() + theme(text=element_text(size=17)) + theme(legend.position="top") +
# geom_line(data=data.frame(ALE.I1kfirm.yt1), aes(color="I"), lty=1, lwd=1) +
# geom_line(data=data.frame(ALE.I1kfirm.ywt1), aes(color="W+I"), lty=5, lwd=1) +
scale_color_manual(name='Rich groups:',
breaks=c('W', 'I', 'W+I'),
values=c('W'='red', 'I'='black', 'W+I'='darkorange')) +
theme(axis.line = element_line(colour = "black"),panel.grid.major.x = element_blank(),
panel.grid.minor.x = element_blank(), panel.grid.minor.y = element_blank(),
panel.border = element_blank(),panel.background = element_blank()) +
scale_y_continuous(breaks=seq(0,.4,.05))
Figure 5: Gross wealth shares by asset class and rich groups has been generated as follows.
use "${SOEP_IN}\SC19_final.dta", clear
keep if hhrf!=. & hhrf>0
//set seed
set seed 1234
//keep M-samples with zero wealth
replace w01110=0 if inlist(psample,17,18,19)
ren hhpressi_inc hhmkt_inc
//collapse to HH level
collapse (sum) w01110 p01010 e01010 l01000 f01010 h01000 b01000 v01000 t01000 ///
(mean) hhmkt_inc hhpre_inc hhpst_inc hhlab_inc hhcap_inc hhrf hhrfao hhrfp psample , by(hid)
//make weights sets for different samples
gen w_SOEP=hhrfao if psample<22
//Final dataset, i.e. fully filled in
gen w_all=hhrf
//renormalize weights to sum to N
foreach v of varlist w_SOEP w_all {
sum `v'
replace `v'=`v'/r(mean) if `v'!=.
}
//generate wealth, income and wealth+income groups to merge in with individual level data
//wealth groups
sum w01110 [w=hhrf], d
capture drop w_grps
gen w_grps=1
replace w_grps=2 if w01110>=r(p90) & w01110!=.
replace w_grps=3 if w01110>=r(p99) & w01110!=.
gen hh_netw=w01110
//income groups
capture drp y_grps
sum hhmkt_inc [w=hhrf], d
gen y_grps=1
replace y_grps=2 if hhmkt_inc>=r(p90) & w01110!=.
replace y_grps=3 if hhmkt_inc>=r(p99) & w01110!=.
//wealth and income groups
gen yw_grps=1
replace yw_grps=2 if w_grps>=2 & y_grps>=2
replace yw_grps=3 if w_grps==3 & y_grps==3
// top 10% income
gen byte d_yt10=0
replace d_yt10=1 if y_grps==2
// top 10% wealth
gen byte d_wt10=0
replace d_wt10=1 if w_grps==2
// top 10% w+i
gen byte d_ywt10=0
replace d_ywt10=1 if yw_grps==2
//in top 1% of income
gen byte d_yt1=0
replace d_yt1=1 if y_grps==3
//in top 1% of wealth
gen byte d_wt1=0
replace d_wt1=1 if w_grps==3
//in top 1% of wealth
gen byte d_ywt1=0
replace d_ywt1=1 if yw_grps==3
//build new gross wealth categories
//real estate
egen realW=rowtotal(p01010 e01010)
replace realW=0 if realW==.
//closely held firms
gen busiW=0
replace busiW=b01000 if !mi(b01000) & b01000>0
//financial assets
egen finW=rowtotal(l01000 f01010 h01000)
replace finW=0 if finW==.
//tangibles
egen tangW=rowtotal(v01000 t01000)
replace tangW=0 if tangW==.
//gross
egen grossW=rowtotal(p01010 e01010 l01000 f01010 h01000 b01000 v01000 t01000)
//collapse over all groups separately
foreach v of varlist d_yt1 d_wt1 d_ywt1 d_yt10 d_wt10 d_ywt10{
preserve
drop if `v'==0
collapse (sum) realW busiW finW tangW grossW [aw=hhrf] , by(`v')
gen sh_real=100*(realW/grossW)
gen sh_busi=100*(busiW/grossW)
gen sh_tang=100*(tangW/grossW)
gen sh_fin =100*(finW/grossW)
tempfile dat_`v'
save `dat_`v'',replace
restore
}
preserve
use `dat_d_yt1', clear
foreach v of newlist d_wt1 d_ywt1 d_yt10 d_wt10 d_ywt10 {
append using `dat_`v''
}
gen stgrps=1 if d_wt10==1
replace stgrps=2 if d_yt10==1
replace stgrps=3 if d_ywt10==1
//top 1
replace stgrps=4 if d_wt1==1
replace stgrps=5 if d_yt1==1
replace stgrps=6 if d_ywt1==1
//graph stacked bars
graph bar sh_tang sh_fin sh_real sh_busi , stack ///
over(stgrps, relabel(1 `" "W" " " "' 2 `" "I" "Top 10-1%" "' 3 `" "I+W" " " "' 4 `" "W" " " "' 5 `" "I" "Top 1%" "' 6 `" "I+W" " " "')) ///
blabel(bar,position(center) format(%9.2f)) ///
legend(order(1 "Tangibles" 2 "Financial" 3 "Real Estate" 4 "Firms" ) pos(12) row(1) nobox region(lstyle(none)) ) xsize(6) ///
scheme(s2color) graphregion(color(white)) ///
bar(3, lc(orange) fc(orange%50) ) bar(4, lcolor(red) fc(red%50)) ///
bar(2, lc(sienna) fc(sienna%50)) bar(1, lc(black) fc(black%50))
graph export "${RES_PATH}\graphs\ShofGW.png", replace
library(sessioninfo)
platform_info()
setting value
version R version 4.3.2 (2023-10-31)
os macOS Sonoma 14.1
system aarch64, darwin20
ui X11
language (EN)
collate en_US.UTF-8
ctype en_US.UTF-8
tz Europe/Paris
date 2024-09-05
pandoc 3.1.1 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
package_info(c("ALEPlot", "copula", "ranger", "rpart", "rpart.plot", "rsample", "pdp"),
dependencies=FALSE)
package * version date (UTC) lib source
ALEPlot * 1.1 2018-05-24 [1] CRAN (R 4.3.0)
copula * 1.1-4 2024-08-17 [1] CRAN (R 4.3.3)
pdp * 0.8.1 2022-06-07 [1] CRAN (R 4.3.0)
ranger * 0.16.0 2023-11-12 [1] CRAN (R 4.3.1)
rpart * 4.1.21 2023-10-09 [1] CRAN (R 4.3.2)
rpart.plot * 3.1.2 2024-02-26 [1] CRAN (R 4.3.1)
rsample * 1.2.1 2024-03-25 [1] CRAN (R 4.3.1)
[1] /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library