Course set solutions – thematic course on dataset construction
The script below demonstrates the different types of data sets you can create, and how to convert from one data structure to another. Furthermore, it is shown how to aggregate data up to a higher unit level, how to create random samples, and finally different ways to recode variables.
This was reviewed in our theme course, which was run twice in 2022. Click here for more about our courses.
textblock
Theme Course on Building Datasets
-------------------------------
endblock
//Connect to the databank
require no.ssb.fdb:18 as db
textblock
A) How to create different types of datasets
endblock
//Creates a regular wide-dataset consisting of a 1% sample of all residents per 1/1 2018
create-dataset widedata
import db/BEFOLKNING_STATUSKODE 2018-01-01 as regstat18
keep if regstat18 == '1'
sample 0.01 333
import db/BEFOLKNING_STATUSKODE 2019-01-01 as regstat19
import db/BEFOLKNING_STATUSKODE 2020-01-01 as regstat20
import db/SIVSTANDFDT_SIVSTAND 2018-01-01 as civil_status18
import db/SIVSTANDFDT_SIVSTAND 2019-01-01 as civil_status19
import db/SIVSTANDFDT_SIVSTAND 2020-01-01 as civil_status20
import db/BEFOLKNING_KJOENN as gender
import db/INNTEKT_WLONN 2018-01-01 as salary18
import db/INNTEKT_WLONN 2019-01-01 as salary19
import db/INNTEKT_WLONN 2020-01-01 as salary20
tabulate civil_status18, missing
tabulate civil_status19, missing
tabulate civil_status20, missing
tabulate gender, missing
summarize salary18 salary19 salary20
//Creates a corresponding dataset in long-format
clone-units widedata longdata
use longdata
import-panel db/BEFOLKNING_STATUSKODE db/SIVSTANDFDT_SIVSTAND db/INNTEKT_WLONN db/BEFOLKNING_KJOENN 2018-01-01 2019-01-01 2020-01-01
rename BEFOLKNING_STATUSKODE regstat
rename SIVSTANDFDT_SIVSTAND civil_status
rename INNTEKT_WLONN salary
rename BEFOLKNING_KJOENN gender
tabulate-panel civil_status, missing
tabulate-panel gender, missing
summarize-panel salary
generate year = year(date@panel)
tabulate civil_status year, missing
textblock
B1) How to change data structure from wide to long
endblock
use widedata
reshape-to-panel regstat civil_status salary
//Tests if it matches the counts on the wide-dataset
tabulate date@panel, missing
tabulate-panel regstat, missing
tabulate-panel civil_status, missing
tabulate-panel regstat civil_status, missing
tabulate-panel gender, missing
tabulate-panel regstat gender, missing
tabulate-panel civil_status gender, missing
summarize salary
summarize-panel salary
textblock
B2) How to change data structure from long to wide
endblock
reshape-from-panel
drop gender20
rename gender19 gender
tabulate regstat19, missing
tabulate regstat20, missing
tabulate civil_status19, missing
tabulate civil_status20, missing
tabulate gender, missing
summarize salary19 salary20
//How to do it when you have a long-dataset created with import-panel
use longdata
tabulate-panel regstat, missing
tabulate-panel civil_status, missing
tabulate-panel gender, missing
summarize-panel salary
reshape-from-panel
drop gender18262
rename gender17897 gender
rename regstat17897 regstat19
rename regstat18262 regstat20
rename civil_status17897 civil_status19
rename civil_status18262 civil_status20
rename salary17897 salary19
rename salary18262 salary20
summarize salary19 salary20
textblock
C) How to Aggregate Data
endblock
//Example 1: Event level -> Person level
//Creates a population consisting of all residents in Norway as of 2019-01-01
create-dataset totalpop
import db/BEFOLKNING_KJOENN as gender
import db/BEFOLKNING_STATUSKODE 2019-01-01 as regstat
keep if regstat == '1'
//Finds people with status as divorced during the year before (2018)
create-dataset event
import-event db/SIVSTANDFDT_SIVSTAND 2018-01-01 to 2018-12-31 as marital_status
keep if marital_status == '4'
collapse (count) marital_status, by(PERSONID_1 )
rename marital_status times_divorced
tabulate times_divorced
//Connects divorce information to the dataset with all residents and creates a dummy indicating status = divorced
merge times_divorced into totalpop
use totalpop
generate divorced2018 = 0
replace divorced2018 = 1 if times_divorced >= 1
tabulate divorced2018, cellpct
piechart divorced2018
//Example 2: Person level -> Family level
//First creates a person dataset for people in families consisting of married couples with small children
create-dataset persondata
import db/BEFOLKNING_REGSTAT_FAMTYP 2017-01-01 as famtype
tabulate famtype
keep if famtype == '2.1.1'
//Adds various demographic information
import db/BEFOLKNING_KJOENN as gender
import db/BEFOLKNING_FOEDSELS_AAR_MND as birth_year_month
generate age = 2017 - int(birth_year_month/100)
import db/BEFOLKNING_KOMMNR_FAKTISK 2017-01-01 as residence
generate county = substr(residence, 1, 2)
import db/BEFOLKNING_BARN_I_HUSH 2017-01-01 as num_children
//Creates dataset for generating total professional income per family => unit = family
create-dataset familydata
import db/BEFOLKNING_REGSTAT_FAMNR 2017-01-01 as famnr
import db/INNTEKT_WYRKINNT 2017-01-01 as professional_income
collapse (sum) professional_income, by(famnr)
rename professional_income family_income
//Connects family income to the person dataset (unit = persons)
merge family_income into persondata on PERSONID_1
//Creates family statistics. The family number consists of the person-id of the oldest person in the family, so when one removes individuals with missing family income one is left with a dataset with the family as a unit. All personal information will then apply to the oldest person in the family
use persondata
drop if sysmiss(family_income)
rename age age_oldest
rename gender gender_oldest
define-labels county_text '01' Østfold '02' Akershus '03' Oslo '04' Hedmark '05' Oppland '06' Buskerud '07' Vestfold '08' Telemark '09' 'Aust-Agder' '10' 'Vest-Agder' '11' Rogaland '12' Hordaland '14' 'Sogn og Fjordane' '15' 'Møre og Romsdal' '16' 'Sør-Trøndelag' '17' 'Nord-Trøndelag' '18' Nordland '19' Troms '20' Finnmark '21' Spitsbergen '25' 'Studying abroad' '99' Unknown
assign-labels county county_text
tabulate county
histogram age_oldest, discrete
histogram num_children, discrete percent
tabulate num_children
tabulate num_children, cellpct
tabulate num_children gender_oldest
summarize family_income
barchart (mean) family_income, by(county)
barchart (mean) family_income, by(num_children)
histogram family_income, freq
histogram family_income, by(num_children) percent
//Example 3: Person level -> Municipality level
//Creates person dataset
create-dataset persons
import db/BEFOLKNING_KOMMNR_FAKTISK 2019-01-01 as municipality
import db/BEFOLKNING_KJOENN as gender
import db/INNTEKT_WLONN 2019-01-01 as salary
import db/INNTEKT_BER_BRFORM 2019-01-01 as fortune
summarize salary fortune
histogram salary, freq
histogram fortune, freq
//Creates average salary and average fortune per municipality and connects to person dataset
create-dataset municipaldata_salary_fortune
import db/BEFOLKNING_KOMMNR_FAKTISK 2019-01-01 as municipality
import db/INNTEKT_WLONN 2019-01-01 as salary
import db/INNTEKT_BER_BRFORM 2019-01-01 as fortune
collapse (mean) salary fortune, by(municipality)
rename salary avg_salary_municipality
rename fortune avg_fortune_municipality
summarize avg_salary_municipality avg_fortune_municipality
merge avg_salary_municipality avg_fortune_municipality into persons on municipality
//Creates number of unemployed and residents per municipality and connects to person dataset
create-dataset municipaldata_unemployed_residents
import db/BEFOLKNING_KOMMNR_FAKTISK 2019-01-01 as municipality
import db/ARBSOEK2001FDT_HOVED 2019-01-01 as as_status
generate num_unemployed = 1 if as_status == '1'
generate num_residents = 1
collapse (sum) num_unemployed num_residents, by(municipality)
summarize num_unemployed num_residents
merge num_unemployed num_residents into persons on municipality
//Finally uses the person dataset to perform a simple regression analysis
use persons
generate unemployed_pct = (num_unemployed / num_residents) * 100
generate male = 0
replace male = 1 if gender == '1'
generate oslo = 0
replace oslo = 1 if municipality == '0301'
generate large_municipality = 0
replace large_municipality = 1 if num_residents > 80000
generate rich_municipality = 0
replace rich_municipality = 1 if avg_fortune_municipality > 2000000
regress salary male oslo fortune avg_fortune_municipality avg_salary_municipality unemployed_pct num_residents
regress salary male oslo fortune rich_municipality avg_salary_municipality unemployed_pct large_municipality
textblock
D) Create Random Samples
endblock
//Creates a dataset with all residents in Norway as of 1/1 2019 and then draws a 10% sample of these
create-dataset totalpop1
import db/BEFOLKNING_STATUSKODE 2019-01-01 as registerstatus19
keep if registerstatus19 == '1'
sample 0.1 999
//Creates a dataset with all residents in Norway as of 1/1 2019 and then draws a sample of 5000 individuals
create-dataset totalpop2
import db/BEFOLKNING_STATUSKODE 2019-01-01 as registerstatus19
keep if registerstatus19 == '1'
sample 5000 888
//Creates a dataset with all residents in Norway as of 1/1 2019 and then draws a sample of 5000 new individuals (different from the previous draw)
create-dataset totalpop3
import db/BEFOLKNING_STATUSKODE 2019-01-01 as registerstatus19
keep if registerstatus19 == '1'
sample 5000 950
textblock
E) Different Ways to Code/Recoding Variables
endblock
//Extract parts of variable code
create-dataset demographicdata1
import db/BEFOLKNING_KOMMNR_FAKTISK 2017-01-01 as residence
//Codes from municipality to county level
generate county = substr(residence, 1, 2)
//Adds value labels to name counties by name (=> prettier descriptive output)
define-labels counties_string '01' Østfold '02' Akershus '03' Oslo '04' Hedmark '05' Oppland '06' Buskerud '07' Vestfold '08' Telemark '09' 'Aust-Agder' '10' 'Vest-Agder' '11' Rogaland '12' Hordaland '14' 'Sogn og Fjordane' '15' 'Møre og Romsdal' '16' 'Sør-Trøndelag' '17' 'Nord-Trøndelag' '18' Nordland '19' Troms '20' Finnmark '21' Spitsbergen '25' 'Education abroad' '99' Unknown
assign-labels county counties_string
tabulate county
//Extract parts of numerical variable values
create-dataset demographicdata2
import db/BEFOLKNING_FOEDSELS_AAR_MND as birth_year_month
//Calculates age in 2020 from birth year
generate age = 2020 - int(birth_year_month / 100)
//Dummy
create-dataset demographicdata3
import db/BEFOLKNING_KJOENN as gender
//Creates a dummy variable indicating male from the gender variable
generate male = 0
replace male = 1 if gender == '1'
tabulate male
//Multiple categories
create-dataset demographicdata4
import db/INNTEKT_BRUTTOFORM 2019-12-31 as fortune
//Group fortune into 4 intervals
generate fortune_int = 1
replace fortune_int = 2 if fortune > 500000
replace fortune_int = 3 if fortune > 1000000
replace fortune_int = 4 if fortune > 1500000
tabulate fortune_int
//recode - dummy
create-dataset demographicdata5
import db/BEFOLKNING_KJOENN as gender
destring gender
tabulate gender
recode gender (2 = 0)
rename gender male
tabulate male, nolabels
//recode - multiple categories
create-dataset population
import db/BEFOLKNING_STATUSKODE 2020-01-01 as statuscode
keep if statuscode == '1'
import db/BEFOLKNING_FODELAND as birth_country
tabulate birth_country
//Creates copies of the birth_country variable to demonstrate different types of recoding, and to control the recoding
clone-variables birth_country -> birth_country2
clone-variables birth_country -> birth_country_orig
//Recodes birth_country to world region - first uses destring to convert the format to numeric
destring birth_country
recode birth_country (111 120 138 139 140 148 155 156 159/164 = 2) (101/141 144/158 = 1) (203/393 = 3) (143 404/578 = 4) (612 684 = 5) (601/775 = 6) (802/840 = 7) (980 = 8) (990 = 9)
define-labels vregion 0 'Norway' 1 'EU/EEA' 2 'European countries outside the EU' 3 'Africa' 4 'Asia including Turkey' 5 'North America' 6 'South and Central America' 7 'Oceania' 8 'Stateless' 9 'Unknown'
assign-labels birth_country vregion
tabulate birth_country
tabulate birth_country_orig birth_country
//Recodes birth_country to world region and creates labels in the same expression
destring birth_country2
recode birth_country2 (111 120 138 139 140 148 155 156 159/164 = 2 'European countries outside the EU') (101/141 144/158 = 1 'EU/EEA') (203/393 = 3 'Africa') (143 404/578 = 4 'Asia including Turkey') (612 684 = 5 'North America') (601/775 = 6 'South and Central America') (802/840 = 7 'Oceania') (980 = 8 'Stateless') (990 = 9 'Unknown')
tabulate birth_country2
//recode - alphanumeric variables
create-dataset demographicdata6
import db/SIVSTANDFDT_SIVSTAND 2019-01-01 as marital_status
tabulate marital_status
recode marital_status ('1' = '1')('2' = '2')(* = '3' 'Other')
tabulate marital_status