# Statistical Matching using Optimal Transport

## Introduction

In this vignette we will explain how some functions of the package are used in order to estimate a contingency table. We will work on the eusilc dataset of the laeken package. All the functions presented in the following are explained in the proposed manuscript by Raphaël Jauslin and Yves Tillé (2021) <arXiv:2105.08379>.

## Contingency table

We will estimate the contingency table when the factor variable which represents the economic status pl030 is crossed with a discretized version of the equivalized household income eqIncome. In order to discretize the equivalized income, we calculate percentiles (0.15,0.30,0.45,0.60,0.75,0.90) of the variable and define the category as intervals between the values.

library(laeken)
library(sampling)
library(StratifiedSampling)
#> Le chargement a nécessité le package : Matrix

data("eusilc")
eusilc <- na.omit(eusilc)
N <- nrow(eusilc)

# Xm are the matching variables and id are identity of the units
Xm <- eusilc[,c("hsize","db040","age","rb090","pb220a")]
Xmcat <- do.call(cbind,apply(Xm[,c(2,4,5)],MARGIN = 2,FUN = disjunctive))
Xm <- cbind(Xmcat,Xm[,-c(2,4,5)])
id <- eusilc$rb030 # categorial income splitted by the percentile c_income <- eusilc$eqIncome
q <- quantile(eusilc$eqIncome, probs = seq(0, 1, 0.15)) c_income[which(eusilc$eqIncome <= q)] <- "(0,15]"
c_income[which(q < eusilc$eqIncome & eusilc$eqIncome <= q)] <- "(15,30]"
c_income[which(q < eusilc$eqIncome & eusilc$eqIncome <= q)] <- "(30,45]"
c_income[which(q < eusilc$eqIncome & eusilc$eqIncome <= q)] <- "(45,60]"
c_income[which(q < eusilc$eqIncome & eusilc$eqIncome <= q)] <- "(60,75]"
c_income[which(q < eusilc$eqIncome & eusilc$eqIncome <= q)] <- "(75,90]"
c_income[which(  eusilc$eqIncome > q )] <- "(90,100]" # variable of interests Y <- data.frame(ecostat = eusilc$pl030)
Z <- data.frame(c_income = c_income)

# put same rownames
rownames(Xm) <- rownames(Y) <- rownames(Z)<- id

YZ <- table(cbind(Y,Z))
#>        c_income
#> ecostat (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100]   Sum
#>     1      409     616     722     807     935    1025      648  5162
#>     2      189     181     205     184     165     154       82  1160
#>     3      137      90      72      75      59      52       33   518
#>     4      210     159     103      95      74      49       46   736
#>     5      470     462     492     477     459     435      351  3146
#>     6       57      25      28      30      17      11       10   178
#>     7      344     283     194     149     106      91       40  1207
#>     Sum   1816    1816    1816    1817    1815    1817     1210 12107

## Sampling schemes

Here we set up the sampling designs and define all the quantities we will need for the rest of the vignette. The sample are selected with simple random sampling without replacement and the weights are equal to the inverse of the inclusion probabilities.


# size of sample
n1 <- 1000
n2 <- 500

# samples
s1 <- srswor(n1,N)
s2 <- srswor(n2,N)

# extract matching units
X1 <- Xm[s1 == 1,]
X2 <- Xm[s2 == 1,]

# extract variable of interest
Y1 <- data.frame(Y[s1 == 1,])
colnames(Y1) <- colnames(Y)
Z2 <- as.data.frame(Z[s2 == 1,])
colnames(Z2) <- colnames(Z)

# extract correct identities
id1 <- id[s1 == 1]
id2 <- id[s2 == 1]

# put correct rownames
rownames(Y1) <- id1
rownames(Z2) <- id2

# here weights are inverse of inclusion probabilities
d1 <- rep(N/n1,n1)
d2 <- rep(N/n2,n2)

# disjunctive form
Y_dis <- sampling::disjunctive(as.matrix(Y))
Z_dis <- sampling::disjunctive(as.matrix(Z))

Y1_dis <- Y_dis[s1 ==1,]
Z2_dis <- Z_dis[s2 ==1,]

## Harmonization

Then the harmonization step must be performed. The harmonize function returns the harmonized weights. If by chance the true population totals are known, it is possible to use these instead of the estimate made within the function.



re <- harmonize(X1,d1,id1,X2,d2,id2)

# if we want to use the population totals to harmonize we can use
re <- harmonize(X1,d1,id1,X2,d2,id2,totals = c(N,colSums(Xm)))

w1 <- re$w1 w2 <- re$w2

colSums(Xm)
#>      1      2      3      4      5      6      7      8      9     10     11
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844
#>     12     13     14  hsize    age
#>  11073    283    751  36380 559915
colSums(w1*X1)
#>      1      2      3      4      5      6      7      8      9     10     11
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844
#>     12     13     14  hsize    age
#>  11073    283    751  36380 559915
colSums(w2*X2)
#>      1      2      3      4      5      6      7      8      9     10     11
#>    476    887   2340    763   1880   1021   2244   1938    558   6263   5844
#>     12     13     14  hsize    age
#>  11073    283    751  36380 559915

## Optimal transport matching

The statistical matching is done by using the otmatch function. The estimation of the contingency table is calculated by extracting the id1 units (respectively id2 units) and by using the function tapply with the correct weights.


# Optimal transport matching
object <- otmatch(X1,id1,X2,id2,w1,w2)
#>         id1    id2     weight
#> 401     401 483102 14.9080488
#> 1801   1801 371901 13.0680348
#> 2802   2802  30101  0.6713613
#> 2802.1 2802 303401  3.9807038
#> 2802.2 2802 597501  8.4273587
#> 3501   3501 339003  5.4671218

Y1_ot <- cbind(X1[as.character(object$id1),],y = Y1[as.character(object$id1),])
Z2_ot <- cbind(X2[as.character(object$id2),],z = Z2[as.character(object$id2),])
YZ_ot <- tapply(object$weight,list(Y1_ot$y,Z2_ot$z),sum) # transform NA into 0 YZ_ot[is.na(YZ_ot)] <- 0 # result round(addmargins(YZ_ot),3) #> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum #> 1 798.837 608.164 635.804 817.795 943.154 805.084 457.629 5066.467 #> 2 243.294 136.607 147.983 177.563 215.863 175.939 99.867 1197.117 #> 3 109.862 122.810 89.308 99.774 93.710 138.813 8.098 662.375 #> 4 129.711 76.272 71.171 98.921 94.545 63.649 25.722 559.991 #> 5 654.381 318.511 362.393 488.008 488.052 402.352 407.157 3120.855 #> 6 34.198 27.365 45.834 71.793 13.557 48.386 16.290 257.423 #> 7 221.135 184.752 193.826 214.999 221.704 142.417 63.937 1242.771 #> Sum 2191.419 1474.481 1546.320 1968.853 2070.585 1776.640 1078.701 12107.000 ## Balanced sampling As you can see from the previous section, the optimal transport results generally do not have a one-to-one match, meaning that for every unit in sample 1, we have more than one unit with weights not equal to 0 in sample 2. The bsmatch function creates a one-to-one match by selecting a balanced stratified sampling to obtain a data.frame where each unit in sample 1 has only one imputed unit from sample 2.  # Balanced Sampling BS <- bsmatch(object) head(BS$object[,1:3])
#>         id1    id2    weight
#> 401     401 483102 14.908049
#> 1801   1801 371901 13.068035
#> 2802.1 2802 303401  3.980704
#> 3501.1 3501 570904  7.023456
#> 4001   4001 425101  6.044974
#> 5201   5201 215302 12.230852

Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),])
Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),])
YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum) YZ_bs[is.na(YZ_bs)] <- 0 round(addmargins(YZ_bs),3) #> (0,15] (15,30] (30,45] (45,60] (60,75] (75,90] (90,100] Sum #> 1 852.388 579.251 620.583 798.392 936.224 826.174 453.455 5066.467 #> 2 239.351 136.121 164.141 190.650 197.252 148.271 121.331 1197.117 #> 3 113.757 114.573 79.136 94.826 96.779 148.434 14.869 662.375 #> 4 133.345 74.763 62.515 93.961 101.461 69.539 24.406 559.991 #> 5 677.249 329.620 381.421 488.708 462.650 375.320 405.888 3120.855 #> 6 35.026 27.932 39.346 71.604 13.557 46.307 23.650 257.423 #> 7 234.259 184.017 199.352 216.231 208.327 140.950 59.634 1242.771 #> Sum 2285.374 1446.278 1546.496 1954.372 2016.250 1754.997 1103.234 12107.000 # With Z2 as auxiliary information for stratified balanced sampling. BS <- bsmatch(object,Z2) Y1_bs <- cbind(X1[as.character(BS$object$id1),],y = Y1[as.character(BS$object$id1),]) Z2_bs <- cbind(X2[as.character(BS$object$id2),],z = Z2[as.character(BS$object$id2),]) YZ_bs <- tapply(BS$object$weight/BS$q,list(Y1_bs$y,Z2_bs$z),sum)
YZ_bs[is.na(YZ_bs)] <- 0
#>       (0,15]  (15,30]  (30,45]  (45,60]  (60,75]  (75,90] (90,100]       Sum
#> 1    736.518  591.620  678.617  845.382  946.135  815.990  452.206  5066.467
#> 2    272.931  138.393  113.688  194.099  245.541  149.761   82.704  1197.117
#> 3    102.668  114.573   79.136   93.842   96.779  160.508   14.869   662.375
#> 4    133.345  100.536   88.432  104.760   65.250   32.060   35.608   559.991
#> 5    664.601  331.514  326.147  481.066  476.456  429.956  411.115  3120.855
#> 6     35.564   38.092   39.346   61.444   13.557   57.740   11.679   257.423
#> 7    242.581  163.849  223.569  192.967  219.221  128.421   72.163  1242.771
#> Sum 2188.207 1478.578 1548.936 1973.560 2062.939 1774.437 1080.343 12107.000

## Prediction


# split the weight by id1
q_l <- split(object$weight,f = object$id1)
# normalize in each id1
q_l <- lapply(q_l, function(x){x/sum(x)})
q <- as.numeric(do.call(c,q_l))

Z_pred <- t(q*disjunctive(object$id1))%*%disjunctive(Z2[as.character(object$id2),])
colnames(Z_pred) <- levels(factor(Z2\$c_income))
#> [6,] 0.0000000 0.0000000 0.00000000 0.0000000       0 1.0000000        0