Codebooks with retroharmonize • eurobarometer

library(eurobarometer)

# Working directory for this vignette, located inside the session's
# temporary folder; it is created only when it does not exist yet.
codebook_vignette_dir <- file.path(tempdir(), "codebook_vignette")
if (!dir.exists(codebook_vignette_dir)) {
  dir.create(path = codebook_vignette_dir)
}

This code created the /tmp/RtmpWUnMaO/codebook_vignette directory. You can replicate this vignette with the code above, or you can use a directory of your own to try the codebook_create() function.

The ‘eurobarometer’ R package contains two very small subsamples of Eurobarometer microdata files. You cannot work directly in the package's installed directory (the path returned by system.file()), so we will copy the contents of this directory to the temporary folder.

# File names found in the extdata folder of the installed package.
sample_files <- list.files(system.file("extdata", package = "eurobarometer"))
sample_files
#> [1] "ZA4529_sample.sav" "ZA5933_sample.sav"

# Copy the packaged sample files into the vignette directory, so that all
# further work happens on copies rather than inside the installed package.
extdata_dir <- system.file("extdata", package = "eurobarometer")

# Full paths of the source files (the directory is looked up only once).
sytem_file_paths <- file.path(extdata_dir, dir(extdata_dir))

# file.copy() is vectorised over `from` and `to`, so a single call replaces
# the element-by-element loop. Existing copies are overwritten. invisible()
# keeps the logical success vector out of the rendered output, matching the
# silent behaviour of the original loop.
invisible(
  file.copy(
    from = sytem_file_paths,
    to = file.path(codebook_vignette_dir, sample_files),
    overwrite = TRUE
  )
)

The resulting files can be found here:

# Replace the bare file names with the full paths of the copied files.
copied_file_names <- dir(codebook_vignette_dir)
sample_files <- file.path(codebook_vignette_dir, copied_file_names)
sample_files
#> [1] "/tmp/RtmpWUnMaO/codebook_vignette/ZA4529_sample.sav"
#> [2] "/tmp/RtmpWUnMaO/codebook_vignette/ZA5933_sample.sav"

Create A Survey Codebook

Let’s start with ZA5933_sample.sav, because it is a better labelled source file.

# Import the second copied file (ZA5933_sample.sav in the listing above).
sample_data <- read_sav_gesis(file = sample_files[2])

Let’s see the file’s contents:

# Variable-level codebook only: value labels and frequencies are switched off.
simple_codebook <- codebook_create(
  sample_data,
  val_labels = FALSE,
  freq = FALSE
)
simple_codebook
#> Codebook for Eurobarometer 82.4 (November-December 2014) [10.4232/1.13044] 
#>             survey  ZACAT var_name_orig                      var_label_orig
#> 1  10.4232/1.13044 ZA5933           uri                                <NA>
#> 2  10.4232/1.13044 ZA5933      studyno1  ARCHIVE STUDY NUMBER - DISTRIBUTOR
#> 3  10.4232/1.13044 ZA5933      studyno2    ARCHIVE STUDY NUMBER - PUBLISHER
#> 4  10.4232/1.13044 ZA5933           doi                                <NA>
#> 5  10.4232/1.13044 ZA5933       edition                     DATASET EDITION
#> 6  10.4232/1.13044 ZA5933        survey               SURVEY IDENTIFICATION
#> 7  10.4232/1.13044 ZA5933        caseid                                <NA>
#> 8  10.4232/1.13044 ZA5933        uniqid                                <NA>
#> 9  10.4232/1.13044 ZA5933      tnscntry      ORIGINAL TNS COUNTRY/SAMPLE ID
#> 10 10.4232/1.13044 ZA5933       country COUNTRY/SAMPLE ID (SERIES STANDARD)
#> 
#> ... 17 further observations.

The simplified codebook is returned as a dataset object, an S3 class from the dataset package that adds important structural and referential metadata to an R data.frame.

The get_zacat_data_url(simple_codebook$ZACAT[1]) command will return the URL of the original dataset, https://search.gesis.org/research_data/ZA5933.

The entire codebook of the sample file would be very large to show, so we subset the sample_data and create a more detailed codebook for only the d25 and d60 variables.

# Detailed codebook (value labels and frequencies requested) restricted to
# the d25 and d60 columns of the sample.
full_codebook <- codebook_create(
  sample_data[, c("d25", "d60")],
  val_labels = TRUE,
  freq = TRUE
)
full_codebook
#> Codebook for Eurobarometer 82.4 (November-December 2014) (subset) [10.4232/1.13044] 
#>            survey var_name_orig                        var_label_orig
#> 1 10.4232/1.13044           d25                     TYPE OF COMMUNITY
#> 2 10.4232/1.13044           d25                     TYPE OF COMMUNITY
#> 3 10.4232/1.13044           d25                     TYPE OF COMMUNITY
#> 4 10.4232/1.13044           d25                     TYPE OF COMMUNITY
#> 5 10.4232/1.13044           d60 DIFFICULTIES PAYING BILLS - LAST YEAR
#> 6 10.4232/1.13044           d60 DIFFICULTIES PAYING BILLS - LAST YEAR
#> 7 10.4232/1.13044           d60 DIFFICULTIES PAYING BILLS - LAST YEAR
#> 8 10.4232/1.13044           d60 DIFFICULTIES PAYING BILLS - LAST YEAR
#>   val_code_orig             val_label_orig
#> 1             1      Rural area or village
#> 2             2 Small or middle sized town
#> 3             3                 Large town
#> 4             8                         DK
#> 5             1           Most of the time
#> 6             2          From time to time
#> 7             3         Almost never/never
#> 8             7           Refusal (SPONT.)

# Original value labels recorded for the d25 variable.
full_codebook[full_codebook$var_name_orig == "d25", "val_label_orig"]
#> [1] "Rural area or village"      "Small or middle sized town"
#> [3] "Large town"                 "DK"

Work with Multiple Surveys

# Build one codebook covering every survey file in the vignette directory;
# value labels and frequencies are switched off.
joint_codebook <- codebook_create(
  directory = codebook_vignette_dir,
  val_labels = FALSE,
  freq = FALSE
)
#> Reading ZA4529_sample.sav
#> Reading ZA5933_sample.sav
#> Warning in data.frame(survey = rep(as.character(unlist(identifier)), df_rows), :
#> NAs introduced by coercion

# Reproducible row selection: the first row, followed by 11 uniformly drawn
# positions between 2 and the number of rows, rounded to whole numbers and
# sorted in ascending order.
set.seed(2022)
drawn_positions <- runif(n = 11, min = 2, max = nrow(joint_codebook))
random_rows <- c(1, sort(round(drawn_positions, digits = 0)))

# Show only the selected rows
joint_codebook[random_rows, ]
#> Codebook for Multiple Surveys (subset)
#>                survey  ZACAT var_name_orig
#> 1     10.4232/1.10983 ZA4529           uri
#> 12    10.4232/1.10983 ZA4529            v3
#> 24    10.4232/1.10983 ZA4529            v6
#> 34    10.4232/1.10983 ZA4529           v95
#> 216   10.4232/1.10983 ZA4529          v100
#> 316   10.4232/1.10983 ZA4529          v724
#> 518   10.4232/1.13044 ZA5933      tnscntry
#> 11101 10.4232/1.13044 ZA5933            d7
#> 262   10.4232/1.13044 ZA5933          nuts
#> 3311  10.4232/1.13044 ZA5933          nuts
#>                                var_label_orig val_code_orig
#> 1                                        <NA>            NA
#> 12                                       <NA>            NA
#> 24                       NATION - ALL SAMPLES            24
#> 34            QA4 CULTURAL ACTIVITIES: CINEMA             3
#> 216   QA4 CULTURAL ACTIVITIES: HIST MONUMENTS             2
#> 316                          D8 AGE EDUCATION            97
#> 518            ORIGINAL TNS COUNTRY/SAMPLE ID             6
#> 11101                          MARITAL STATUS            11
#> 262                       REGION - NUTS CODES            NA
#> 3311                      REGION - NUTS CODES            NA
#>                                    val_label_orig
#> 1                                            <NA>
#> 12                                           <NA>
#> 24                                      Lithuania
#> 34                                      3-5 times
#> 216                                     1-2 times
#> 316                        No full-time education
#> 518                                         SUOMI
#> 11101 Divorced/Separated: living without children
#> 262                                    Thueringen
#> 3311                         South East [England]
#> 
#> ... 2 further observations.