Skip to main content

Course set solutions – beginners course

The script below shows one way to solve the exercises in the course set used in the introductory course that Sikt and SSB hold at regular intervals. Click here for more about our courses.

 // Building a dataset
// Connecting to the no.ssb.fdb databank under the alias ds; all imports in this script read through ds/
// (the :25 suffix presumably pins the databank version - confirm)
require no.ssb.fdb:25 as ds

// Creating the working dataset, named totalpop, that the rest of the script builds on
create-dataset totalpop

// Importing birth year and month data, calculating age
import ds/BEFOLKNING_FOEDSELS_AAR_MND as birth_ym
// Dividing by 100 and truncating drops the last two digits of birth_ym
// (assumes the value is encoded as YYYYMM - confirm); age is then measured relative to 2021
generate age = 2021 - int(birth_ym/100)
// Age distribution before any filtering has been applied
histogram age, discrete

// Importing register status as of 2021-01-01 and keeping only those with status '1'
// (status '1' presumably marks persons resident on that date - confirm against the variable's codelist)
import ds/BEFOLKNING_STATUSKODE 2021-01-01 as regstat
keep if regstat == '1'

// Displaying age distribution and summary statistics for the filtered population
histogram age, discrete
summarize age

// Keeping only those with age strictly between 30 and 50: both comparisons are strict,
// so persons aged exactly 30 or exactly 50 are dropped
// NOTE(review): if the exercise means 30 to 50 inclusive, the condition should use >= and <= - confirm
keep if age > 30 & age < 50

// Importing various demographic and economic data
// gender and country are imported without a date; edu and residence are read at a fixed date in 2021
import ds/BEFOLKNING_KJOENN as gender
import ds/BEFOLKNING_FODELAND as country
import ds/NUDB_BU 2021-07-31 as edu
import ds/BEFOLKNING_KOMMNR_FAKTISK 2021-01-01 as residence
// The same salary variable is imported once per year-end 2017-2021, giving one column per year
import ds/INNTEKT_LONN 2017-12-31 as salary17
import ds/INNTEKT_LONN 2018-12-31 as salary18
import ds/INNTEKT_LONN 2019-12-31 as salary19
import ds/INNTEKT_LONN 2020-12-31 as salary20
import ds/INNTEKT_LONN 2021-12-31 as salary21

// Running descriptive statistics

// Unidimensional
// Summary statistics for the five salary years, first with default output and then with the gini option
summarize salary17 salary18 salary19 salary20 salary21
summarize salary17 salary18 salary19 salary20 salary21, gini

// Creating various bar charts and histograms for salaries
// One bar per salary year; the statistic in parentheses selects what each bar shows
barchart (mean) salary17 salary18 salary19 salary20 salary21
barchart (count) salary17 salary18 salary19 salary20 salary21
barchart (median) salary17 salary18 salary19 salary20 salary21

// Histograms of the 2021 salary with different options:
// freq (presumably frequencies on the y-axis - confirm), normal (presumably overlays a normal curve - confirm),
// and bin(n) to set the number of bins (10, then 4)
histogram salary21, freq
histogram salary21, freq normal
histogram salary21, bin(10)
histogram salary21, bin(4)

// Two-dimensional

// Creating new variables for country-of-birth indicator and education level
// norwegian starts at 0 for everyone and is set to 1 where country equals '000'
// ('000' is assumed to be the code for Norway in the country-of-birth codelist - confirm)
generate norwegian = 0
replace norwegian = 1 if country == '000'
tabulate norwegian
tabulate norwegian, cellpct
piechart norwegian

// edu_level is the first character of the education code
generate edu_level = substr(edu,1,1)
tabulate edu_level, cellpct
// Converting edu_level from text to numeric so that later commands can compare it with <, > and >=
destring edu_level

// Running descriptive statistics for different education levels
// First for edu_level below 2, then for edu_level above 6
summarize salary17 salary18 salary19 salary20 salary21 if edu_level < 2
summarize salary17 salary18 salary19 salary20 salary21 if edu_level > 6

// Creating cross-tabulations and bar charts for gender, nationality and education level
// summarize(salary21) reports 2021 salary statistics within each category instead of plain counts
tabulate gender, summarize(salary21)
tabulate norwegian, summarize(salary21)
tabulate edu_level, summarize(salary21)
// Two-way breakdown: education level by gender
tabulate edu_level gender, summarize(salary21)

// Mean salary per year, split by each grouping variable in turn
barchart (mean) salary17 salary18 salary19 salary20 salary21, over(gender)
barchart (mean) salary17 salary18 salary19 salary20 salary21, over(norwegian)
barchart (mean) salary17 salary18 salary19 salary20 salary21, over(edu_level)

// Demonstrating categorization of occupations and labels

import ds/REGSYS_ARB_YRKE_STYRK08 2021-11-16 as occupation

// Categorizing occupations into groups
// Everyone starts in group 9 (labelled Other below); each replace then overrides it for matching codes.
// Groups 1, 5, 6 and 7 match on a one- or two-character prefix of the code; groups 2-4 match a full four-character code.
generate occupation_group = 9
replace occupation_group = 1 if substr(occupation,1,1) == '1'
replace occupation_group = 2 if occupation == '2211'
replace occupation_group = 3 if occupation == '2212'
replace occupation_group = 4 if occupation == '2223'
replace occupation_group = 5 if substr(occupation,1,2) == '23'
replace occupation_group = 6 if substr(occupation,1,2) == '25'
replace occupation_group = 7 if substr(occupation,1,2) == '61'
// Persons with no registered occupation get code 999, which the codelist below labels Unemployed
replace occupation_group = 999 if sysmiss(occupation)

// Defining a codelist that maps each group number to a readable label, then attaching it to occupation_group
define-labels occupation_label 1 Leaders 2 'General practitioners' 3 'Medical specialists' 4 Nurses 5 Teachers 6 'IT developers' 7 Agriculture 9 Other 999 Unemployed
assign-labels occupation_group occupation_label 

// Distribution of the occupation groups, and 2021 salary broken down by group
tabulate occupation_group
tabulate occupation_group, cellpct
tabulate occupation_group, summarize(salary21)
barchart (mean) salary21, over(occupation_group)
barchart (mean) salary17 salary18 salary19 salary20 salary21, over(occupation_group)

// Family links: attaching the parents' salary, occupation and education level to each person in totalpop

// Importing the identifiers of each person's father and mother; they are the merge keys used below
import ds/BEFOLKNING_FAR_FNR as father_id
import ds/BEFOLKNING_MOR_FNR as mother_id

// Building a separate dataset with the attributes to attach, named with a _father suffix first
create-dataset parents
import ds/INNTEKT_LONN 2021-12-31 as salary21_father
import ds/REGSYS_ARB_YRKE_STYRK08 2021-11-16 as occupation_father
import ds/NUDB_BU 2021-07-31 as edu_father
// First character of the education code, converted to numeric
// NOTE(review): destring is used with the force option here but without it for edu_level earlier in the script;
// force presumably turns non-numeric values into missing - confirm and align the two if appropriate
generate edu_level_father = substr(edu_father,1,1)
destring edu_level_father, force

// Copying each variable under a _mother name so the same data can be merged once per parent
clone-variables salary21_father -> salary21_mother
clone-variables occupation_father -> occupation_mother
clone-variables edu_level_father -> edu_level_mother

// Merging into totalpop twice: the _father variables matched on father_id, the _mother variables on mother_id
merge salary21_father occupation_father edu_level_father into totalpop on father_id
merge salary21_mother occupation_mother edu_level_mother into totalpop on mother_id

// Switching back to totalpop, which now also holds the merged parent variables
use totalpop
summarize salary21 salary21_father salary21_mother
// Pairwise correlations of 2021 salary: person vs father, person vs mother, father vs mother
correlate salary21 salary21_father
correlate salary21 salary21_mother
correlate salary21_father salary21_mother
// The same three pairings for education level
correlate edu_level edu_level_father
correlate edu_level edu_level_mother
correlate edu_level_father edu_level_mother

// Same-sex pairings only: gender '1' against the father, gender '2' against the mother
// (presumably '1' = male and '2' = female - confirm against the gender codelist)
correlate salary21 salary21_father if gender == '1'
correlate salary21 salary21_mother if gender == '2'
correlate edu_level edu_level_father if gender == '1'
correlate edu_level edu_level_mother if gender == '2'

// Grouping the father's occupation with the same rules as occupation_group:
// default 9, overridden by code or code prefix, and 999 where the occupation is missing
generate occupation_group_father = 9
replace occupation_group_father = 1 if substr(occupation_father,1,1) == '1'
replace occupation_group_father = 2 if occupation_father == '2211'
replace occupation_group_father = 3 if occupation_father == '2212'
replace occupation_group_father = 4 if occupation_father == '2223'
replace occupation_group_father = 5 if substr(occupation_father,1,2) == '23'
replace occupation_group_father = 6 if substr(occupation_father,1,2) == '25'
replace occupation_group_father = 7 if substr(occupation_father,1,2) == '61'
replace occupation_group_father = 999 if sysmiss(occupation_father)

// Reusing the occupation_label codelist defined earlier in this script
assign-labels occupation_group_father occupation_label
// Father's group against the person's own group, with row percentages.
// The condition < 9 leaves out fathers in group 9 and group 999, so only groups 1-7 appear as rows.
tabulate occupation_group_father occupation_group if occupation_group_father < 9, rowpct
// The same table restricted to persons with gender '1'
tabulate occupation_group_father occupation_group if occupation_group_father < 9 & gender == '1', rowpct


// Finally, run some simple regressions: regress and logit

// Dummy for municipality code '0301' (named oslo): 1 on a match, 0 for any other value
generate oslo = 1 if residence == '0301'
replace oslo = 0 if residence != '0301'
tabulate oslo, cellpct

// Dummy for education level 7 or above; set to 0 only for levels 0-6, so persons matching neither condition stay unset
generate high_edu = 1 if edu_level >= 7
replace high_edu = 0 if edu_level >= 0 & edu_level < 7

// Linear regression of 2021 salary on age, the dummies above, gender and the father's 2021 salary
// (the i. prefix presumably makes gender enter as a categorical variable - confirm)
regress salary21 age norwegian oslo i.gender high_edu salary21_father
// Coefficient plot for the same model
coefplot regress salary21 age norwegian oslo i.gender high_edu salary21_father
// Re-running the model and storing its residuals in the new variable res, which is plotted below
regress-predict salary21 age norwegian oslo i.gender high_edu salary21_father, residuals(res)
histogram res
histogram res, normal

// Binary outcome: 1 when the 2021 salary exceeds 800000, otherwise 0
generate high_salary = 0
replace high_salary = 1 if salary21 > 800000

// The same threshold applied to the father's 2021 salary
generate high_salary_father = 0
replace high_salary_father = 1 if salary21_father > 800000

// Logistic regression of the high-salary indicator on the same covariates as the linear model,
// with the father's salary replaced by his high-salary indicator; followed by a coefficient plot
logit high_salary age norwegian oslo i.gender high_edu high_salary_father
coefplot logit high_salary age norwegian oslo i.gender high_edu high_salary_father


// Event extraction: Identifying individuals with divorced status during 2021
create-dataset event
// Importing marital-status events recorded in the window 2021-01-01 to 2021-12-31
import-event ds/SIVSTANDFDT_SIVSTAND 2021-01-01 to 2021-12-31 as marital_event
// Keeping only events with code '4' (taken to mean divorced, per the heading of this section - confirm against the codelist)
keep if marital_event == '4'
// Counting the remaining events per person (PERSONID_1)
collapse (count) marital_event -> num_times_divorced, by(PERSONID_1 )
tabulate num_times_divorced
// Attaching the per-person count to totalpop
merge num_times_divorced into totalpop

use totalpop
// Indicator that starts at 0 and is set to 1 for persons with at least one counted event
generate divorced2021 = 0
replace divorced2021 = 1 if num_times_divorced >= 1
tabulate divorced2021, cellpct freq