Chapter 1

cdf-ex1.R
pbinom(3, 10, 0.2)
cdf-ex2.R
pnorm(1.96) - pnorm(-1.96)
Critical-Values-t.R
# degrees of freedom = n-1:
df <- 19
# significance levels:
alpha.one.tailed = c(0.1, 0.05, 0.025, 0.01, 0.005, .001)
alpha.two.tailed = alpha.one.tailed * 2

# critical values & table:
CV <- qt(1 - alpha.one.tailed, df)
cbind(alpha.one.tailed, alpha.two.tailed, CV)
Data-frames-subsets.R
# Full data frame (from Data-frames.R, has to be run first)
sales

# Subset: all years in which sales of product 3 were >=3
subset(sales, product3>=3)
Data-frames-vars.R
# Accessing a single variable:
sales$product2

# Generating a new  variable in the data frame:
sales$totalv1 <- sales$product1 + sales$product2 + sales$product3 

# The same but using "with":
sales$totalv2 <- with(sales, product1+product2+product3)

# The same but using "attach":
attach(sales)
sales$totalv3 <- product1+product2+product3
detach(sales)

# Result:
sales
Data-frames.R
# Define one x vector for all:
year     <- c(2008,2009,2010,2011,2012,2013)
# Define a matrix of y values:
product1<-c(0,3,6,9,7,8); product2<-c(1,2,3,5,9,6); product3<-c(2,4,4,2,3,2)
sales_mat <- cbind(product1,product2,product3)
rownames(sales_mat) <- year
# The matrix looks like this:
sales_mat

# Create a data frame and display it:
sales <- as.data.frame(sales_mat)
sales
Descr-Stats.R
data(ceosal1, package='wooldridge')

# sample average:
mean(ceosal1$salary)
# sample median:
median(ceosal1$salary)
#standard deviation:
sd(ceosal1$salary)
# summary information:
summary(ceosal1$salary)

# correlation with ROE:
cor(ceosal1$salary, ceosal1$roe)
Descr-Tables.R
# load data set

data(affairs, package='wooldridge')

# Generate "Factors" to attach labels
haskids <- factor(affairs$kids,labels=c("no","yes"))
mlab <- c("very unhappy","unhappy","average","happy", "very happy")
marriage <- factor(affairs$ratemarr, labels=mlab)

# Frequencies for having kids:
table(haskids)
# Marriage ratings (share):
prop.table(table(marriage))

# Contigency table: counts (display & store in var.)
(countstab <- table(marriage,haskids))

# Share within "marriage" (i.e. within a row):
prop.table(countstab, margin=1)
# Share within "haskids"  (i.e. within a column):
prop.table(countstab, margin=2)
Example-B-6-2.R
1 - pnorm(2,4,3) + pnorm(-2,4,3)
Example-B-6.R
# Using the transformation:
pnorm(2/3) - pnorm(-2/3)
# Working directly with the distribution of X:
pnorm(6,4,3) - pnorm(2,4,3)
Example-C-2.R
# Manually enter raw data from Wooldridge, Table C.3:
SR87<-c(10,1,6,.45,1.25,1.3,1.06,3,8.18,1.67,.98,1,.45,
                                      5.03,8,9,18,.28,7,3.97)
SR88<-c(3,1,5,.5,1.54,1.5,.8,2,.67,1.17,.51,.5,.61,6.7,
                                            4,7,19,.2,5,3.83)
# Calculate Change (the parentheses just display the results):
(Change <- SR88 - SR87)

# Ingredients to CI formula
(avgCh<- mean(Change))
(n    <- length(Change))
(sdCh <- sd(Change))
(se   <- sdCh/sqrt(n))
(c    <- qt(.975, n-1))

# Confidence intervall:
c( avgCh - c*se, avgCh + c*se )
Example-C-3.R
data(audit, package='wooldridge')

# Ingredients to CI formula
(avgy<- mean(audit$y))
(n   <- length(audit$y))
(sdy <- sd(audit$y))
(se  <- sdy/sqrt(n))
(c   <- qnorm(.975))

# 95% Confidence intervall:
avgy + c * c(-se,+se)
# 99% Confidence intervall:
avgy + qnorm(.995) * c(-se,+se)
Example-C-5.R
# Note: we reuse variables from Example-C-3.R. It has to be run first!
# t statistic for H0: mu=0:
(t <- avgy/se)

# Critical values for t distribution with n-1=240 d.f.:
alpha.one.tailed = c(0.1, 0.05, 0.025, 0.01, 0.005, .001)
CV <- qt(1 - alpha.one.tailed, n-1)
cbind(alpha.one.tailed, CV)
Example-C-6.R
# Note: we reuse variables from Example-C-2.R. It has to be run first!
# t statistic for H0: mu=0:
(t <- avgCh/se)

# p value
(p <- pt(t,n-1))
Example-C-7.R
# t statistic for H0: mu=0:
t <-  -4.276816

# p value
(p <- pt(t,240))
Example-Data.R
# The data set is stored on the local computer in 
# ~/data/wooldridge/affairs.dta

# Version 1: from package. make sure to install.packages(wooldridge)
data(affairs, package='wooldridge')

# Version 2: Adjust path
affairs2 <- rio::import("~/data/wooldridge/affairs.dta")

# Version 3: Change working directory
setwd("~/data/wooldridge")
affairs3 <- rio::import("affairs.dta")

# Version 4: directly load from internet
affairs4 <- rio::import("http://fmwww.bc.edu/ec-p/data/wooldridge/affairs.dta")

# Compare, e.g. avg. value of naffairs:
mean(affairs$naffairs)
mean(affairs2$naffairs)
mean(affairs3$naffairs)
mean(affairs4$naffairs)
Examples-C2-C6.R
# data for the scrap rates examples:
SR87<-c(10,1,6,.45,1.25,1.3,1.06,3,8.18,1.67,.98,1,.45,5.03,8,9,18,.28,
                                                                 7,3.97)
SR88<-c(3,1,5,.5,1.54,1.5,.8,2,.67,1.17,.51,.5,.61,6.7,4,7,19,.2,5,3.83)
Change <- SR88 - SR87

# Example C.2: two-sided CI
t.test(Change)
# Example C.6: 1-sided test:
t.test(Change, alternative="less")
Examples-C3-C5-C7.R
data(audit, package='wooldridge')

# Example C.3: two-sided CI
t.test(audit$y)
# Examples C.5 & C.7: 1-sided test:
t.test(audit$y, alternative="less")
Factors.R
# Original ratings:
x <- c(3,2,2,3,1,2,3,2,1,2)
xf <- factor(x, labels=c("bad","okay","good")) 
x
xf
Histogram.R
# Load data
data(ceosal1, package='wooldridge')

# Extract ROE to single vector
ROE <- ceosal1$roe

# Subfigure (a): histogram (counts)
hist(ROE)

# Subfigure (b): histogram (densities, explicit breaks)
hist(ROE, breaks=c(0,5,10,20,30,60) )
invCDF-example.R
qnorm(0.975)
Lists.R
# Generate a list object:
mylist <- list( A=seq(8,36,4), this="that", idm = diag(3))

# Print whole list: 
mylist

# Vector of names:
names(mylist)

# Print component "A":
mylist$A
Logical.R
# Basic comparisons:
0 == 1
0 < 1

# Logical vectors:
( a <- c(7,2,6,9,4,1,3) )

( b <- a<3 | a>=6 )
Matrices.R
# Generating matrix A from one vector with all values:
v <- c(2,-4,-1,5,7,0)
( A <- matrix(v,nrow=2) )

# Generating matrix A from two vectors corresponding to rows:
row1 <- c(2,-1,7); row2 <- c(-4,5,0)
( A <- rbind(row1, row2) )

# Generating matrix A from three vectors corresponding to columns:
col1 <- c(2,-4); col2 <- c(-1,5); col3 <- c(7,0)
( A <- cbind(col1, col2, col3) )

# Giving names to rows and columns:
colnames(A) <- c("Alpha","Beta","Gamma")
rownames(A) <- c("Aleph","Bet") 
A

# Diaginal and identity matrices: 
diag( c(4,2,6) )
diag( 3 )

# Indexing for extracting elements (still using A from above):
A[2,1]
A[,2]
A[,c(1,3)]
Matrix-Operators.R
A <- matrix( c(2,-4,-1,5,7,0), nrow=2)
B <- matrix( c(2,1,0,3,-1,5), nrow=2)
A
B
A*B

# Transpose:
(C <- t(B) )

# Matrix multiplication:
(D <- A %*% C )

# Inverse:
solve(D)
mpg-advanced.R
ggplot(mpg, aes(displ, hwy, color=class, shape=class)) + 
  geom_point() +
  geom_smooth(se=FALSE) +
  scale_color_grey() +
  scale_shape_manual(values=1:7) +
  theme_light() +
  labs(title="Displacement vs. Mileage",
       subtitle="Model years 1988 - 2008",
       caption="Source: EPA through the ggplot2 package",
       x = "Displacement [liters]",
       y = "Miles/Gallon (Highway)",
       color="Car type",
       shape="Car type"
       ) +
  coord_cartesian(xlim=c(0,7), ylim=c(0,45))
  
ggsave("my_ggplot.png", width = 7, height = 5)
mpg-color1.R
ggplot(mpg, aes(displ, hwy)) + 
  geom_point(color=gray(0.5)) +
  geom_smooth(color="black")
mpg-color2.R
ggplot(mpg, aes(displ, hwy)) + 
  geom_point( aes(color=class) ) +
  geom_smooth(color="black") +
  scale_color_grey()
mpg-color3.R
ggplot(mpg, aes(displ, hwy)) + 
  geom_point( aes(color=class, shape=class) ) +
  geom_smooth(color="black") +
  scale_color_grey() +
  scale_shape_manual(values=1:7)
mpg-color4.R
ggplot(mpg, aes(displ, hwy, color=class, shape=class)) + 
  geom_point() +
  geom_smooth(se=FALSE) +
  scale_color_grey() +
  scale_shape_manual(values=1:7)
mpg-data.R
# load package
library(ggplot2)

# First rows of data of data set mpg:
head(mpg)
mpg-regr.R
ggplot(mpg, aes(displ, hwy)) + 
  geom_point() +
  geom_smooth()
mpg-scatter.R
# load package
library(ggplot2)

# Generate ggplot2 graph:
ggplot() + geom_point( data=mpg, mapping=aes(x=displ, y=hwy) )
Objects.R
# generate object x (no output):
x <- 5

# display x & x^2: 
x
x^2

# generate objects y&z with immediate display using ():
(y <- 3)
(z <- y^x)
pmf-ex.R
# Pedestrian approach:
choose(10,2) * 0.2^2 * 0.8^8
# Built-in function:
dbinom(2,10,0.2)
PMF-example.R
# Values for x: all between 0 and 10
x <- seq(0,10)

# pmf for all these values
fx <- dbinom(x, 10, 0.2)

# Table(matrix) of values:
cbind(x, fx) 
# Plot
plot(x, fx, type="h")
R-as-a-Calculator.R
1+1
5*(4-1)^2
sqrt( log(10) )
Random-Numbers.R
# Sample from a standard normal RV with sample size n=5:
rnorm(5)
# A different sample from the same distribution:
rnorm(5)

# Set the seed of the random number generator and take two samples:
set.seed(6254137)
rnorm(5)
rnorm(5)

# Reset the seed to the same value to get the same samples again:
set.seed(6254137)
rnorm(5)
rnorm(5)
RData-Example.R
# Note: "sales" is defined in Data-frames.R, so it has to be run first!
# save data frame as RData file (in the current working directory)
save(sales, file = "oursalesdata.RData")

# remove data frame "sales" from memory
rm(sales)

# Does variable "sales" exist?
exists("sales")

# Load data set  (in the current working directory):
load("oursalesdata.RData")

# Does variable "sales" exist?
exists("sales")

sales

# averages of the variables:
colMeans(sales)
Simulate-Estimate.R
# Set the random seed
set.seed(123456)

# Draw a sample given the population parameters
sample <- rnorm(100,10,2)

# Estimate the population mean with the sample average
mean(sample)

# Draw a different sample and estimate again:
sample <- rnorm(100,10,2)
mean(sample)

# Draw a third sample and estimate again:
sample <- rnorm(100,10,2)
mean(sample)
Simulation-Inference.R
# Set the random seed
set.seed(123456)

# initialize vectors to later store results:
r <- 10000
CIlower <- numeric(r); CIupper <- numeric(r)
pvalue1 <- numeric(r); pvalue2 <- numeric(r)

# repeat r times:
for(j in 1:r) {
  # Draw a sample
  sample <- rnorm(100,10,2)
  # test the (correct) null hypothesis mu=10:
  testres1 <- t.test(sample,mu=10)
  # store CI & p value:
  CIlower[j] <- testres1$conf.int[1]
  CIupper[j] <- testres1$conf.int[2]
  pvalue1[j] <- testres1$p.value
  # test the (incorrect) null hypothesis mu=9.5 & store the p value:
  pvalue2[j] <- t.test(sample,mu=9.5)$p.value
}

# Test results as logical value
reject1<-pvalue1<=0.05;  reject2<-pvalue2<=0.05
table(reject1)
table(reject2)
Simulation-Repeated-Results.R
# The first 20 of 10000 estimates:
ybar[1:20]

# Simulated mean:
mean(ybar)

# Simulated variance:
var(ybar)

# Simulated density:
plot(density(ybar))
curve( dnorm(x,10,sqrt(.04)), add=TRUE,lty=2)
Simulation-Repeated.R
# Set the random seed
set.seed(123456)

# initialize ybar to a vector of length r=10000 to later store results:
r <- 10000
ybar <- numeric(r)

# repeat r times:
for(j in 1:r) {
  # Draw a sample and store the sample mean in pos. j=1,2,... of ybar: 
  sample <- rnorm(100,10,2)
  ybar[j] <- mean(sample)
}
smpl_Bernoulli.R
rbinom(10,1,0.5)
smpl_Norm.R
rnorm(10)
strings.R
cities <- c("New York","Los Angeles","Chicago")
cities
t-test-examples1.R
# data for the scrap rates examples:
SR87<-c(10,1,6,.45,1.25,1.3,1.06,3,8.18,1.67,.98,1,.45,5.03,8,9,18,.28,
                                                                 7,3.97)
SR88<-c(3,1,5,.5,1.54,1.5,.8,2,.67,1.17,.51,.5,.61,6.7,4,7,19,.2,5,3.83)
Change <- SR88 - SR87

# Example C.2: two-sided CI
t.test(Change)
# Example C.6: 1-sided test:
t.test(Change, alternative="less")
t-test-examples2.R
require(foreign)
audit <- read.dta("http://fmwww.bc.edu/ec-p/data/wooldridge/audit.dta")

# Example C.3: two-sided CI
t.test(audit$y)
# Examples C.5 & C.7: 1-sided test:
t.test(audit$y, alternative="less")
Test-Results-List.R
data(audit, package='wooldridge')

# store test results as a list "testres"
testres <- t.test(audit$y)

# print results:
testres

# component names: which results can be accessed?
names(testres)

# p-value
testres$p.value
Var-ls.R
ls()
Variables.R
# generate variable x (no output):
x <- 5

# display x & x^2: 
x
x^2

# generate variables y&z with immediate display using ():
(y <- 3)
(z <- y^x)
Vector-Functions.R
# Define vector
(a <- c(7,2,6,9,4,1,3))

# Basic functions:
sort(a)
length(a)
min(a)
max(a)
sum(a)
prod(a)

# Creating special vectors:
numeric(20)
rep(1,20)
seq(50)
5:15
seq(4,20,2)
Vector-Indices.R
# Create a vector "avgs":
avgs <- c(.366, .358, .356, .349, .346)

# Create a string vector of names:
players <- c("Cobb","Hornsby","Jackson","O'Doul","Delahanty")

# Assign names to vector and display vector:
names(avgs) <- players
avgs

# Indices by number:
avgs[2]
avgs[1:4]

# Indices by name:
avgs["Jackson"]

# Logical indices:
avgs[ avgs>=0.35 ]
Vectors.R
# Define a with immediate output through parantheses:
(a <- c(1,2,3,4,5,6))
(b <- a+1)
(c <- a+b)
(d <- b*c)
sqrt(d)
wdi_data.R
# packages: WDI for raw data, dplyrfor manipulation
library(WDI); library(dplyr); library(ggplot)

wdi_raw <- WDI(indicator=c("SP.DYN.LE00.FE.IN"),
                    start = 1960, end = 2014)

head(wdi_raw)
tail(wdi_raw)
wdi-ctryavg-beautify.R
# Note: run wdi-ctryavg.R first to define "avgdata"!

# Order the levels meaningfully
avgdata$income <- factor( avgdata$income, 
                          levels = c("High income: OECD",
                                     "High income: nonOECD",
                                     "Upper middle income",
                                     "Lower middle income",
                                     "Low income") )
# Plot
ggplot(avgdata, aes(year, LE_avg, color=income)) +
  geom_line(size=1) +                               # thicker lines 
  scale_color_grey() +                              # gray scale
  scale_x_continuous(breaks=seq(1960,2015,10)) +    # adjust x axis breaks  
  theme_light() +                                   # light theme (white background,...)
  labs(title="Life expectancy of women",            
       subtitle="Average by country classification",
       x="Year", y="Life expectancy [Years]",
       color="Income level",
       caption="Source: World Bank, WDI")
wdi-ctryavg.R
# Note: run wdi-ctryinfo.R first to define "alldata"!

# Summarize by country and year
avgdata <- alldata %>%     
  filter(income != "Aggregates") %>%           # remove rows for aggregates
  filter(income != "Not classified") %>%       # remove unclassified ctries
  group_by(income, year) %>%                   # group by income classification
  summarize(LE_avg = mean(LE, na.rm=TRUE)) %>% # average by group 
  ungroup()                                    # remove grouping

# First 6 rows:
tail(avgdata)

# plot
ggplot(avgdata, aes(year, LE_avg, color=income)) +
  geom_line() +
  scale_color_grey()
wdi-ctryinfo.R
library(WDI); library(dplyr)

# Download raw life expectency data
le_data <- WDI(indicator=c("SP.DYN.LE00.FE.IN"), start = 1960, end = 2014) %>% 
  rename(LE = SP.DYN.LE00.FE.IN)

tail(le_data)

# Country-data on income classification
ctryinfo <- as.data.frame(WDI_data$country, stringsAsFactors = FALSE) %>% 
  select(country, income)

tail(ctryinfo)

# Join:
alldata <- left_join(le_data, ctryinfo)

tail(alldata)
wdi-data.R
# packages: WDI for raw data, dplyr for manipulation
library(WDI);

wdi_raw <- WDI(indicator=c("SP.DYN.LE00.FE.IN"), start = 1960, end = 2014)

head(wdi_raw)
tail(wdi_raw)
wdi-manipulation.R
library(dplyr)

# filter: only US data
ourdata <- filter(wdi_raw, iso2c=="US")
# rename lifeexpectancy variable
ourdata <- rename(ourdata, LE_fem=SP.DYN.LE00.FE.IN)
# select relevant variables
ourdata <- select(ourdata, year, LE_fem)
# order by year (increasing)
ourdata <- arrange(ourdata, year)

# Head and tail of data
head(ourdata)
tail(ourdata)

# Graph
library(ggplot2)
ggplot(ourdata, aes(year, LE_fem)) + 
  geom_line() +
  theme_light() +
  labs(title="Life expectancy of females in the US",
       subtitle="World Bank: World Development Indicators",
       x = "Year",
       y = "Life expectancy [years]"
       ) 
wdi-pipes.R
library(dplyr)
# All manipulations with pipes:
ourdata <- wdi_raw %>% 
  filter(iso2c=="US") %>% 
  rename(LE_fem=SP.DYN.LE00.FE.IN) %>% 
  select(year, LE_fem) %>% 
  arrange(year) 
Adv-Functions.py
# define function:
def mysqrt(x):
    if x >= 0:
        result = x ** 0.5
    else:
        result = 'You fool!'
    return result


# call function and save result:
result1 = mysqrt(4)
print(f'result1: {result1}\n')

result2 = mysqrt(-1.5)
print(f'result2: {result2}\n')
Adv-Loops.py
seq = [1, 2, 3, 4, 5, 6]
for i in seq:
    if i < 4:
        print(i ** 3)
    else:
        print(i ** 2)
Adv-Loops2.py
seq = [1, 2, 3, 4, 5, 6]
for i in range(len(seq)):
    if seq[i] < 4:
        print(seq[i] ** 3)
    else:
        print(seq[i] ** 2)
Adv-ObjOr.py
# use the predefined class 'list' to create an object:
a = [2, 6, 3, 6]

# access a local variable (to find out what kind of object we are dealing with):
check = type(a).__name__
print(f'check: {check}\n')

# make use of a method (how many 6 are in a?):
count_six = a.count(6)
print(f'count_six: {count_six}\n')

# use another method (sort data in a):
a.sort()
print(f'a: {a}\n')
Adv-ObjOr2.py
import numpy as np

# multiply these two matrices:
a = np.array([[3, 6, 1], [2, 7, 4]])
b = np.array([[1, 8, 6], [3, 5, 8], [1, 1, 2]])

# the numpy way:
result_np = a.dot(b)
print(f'result_np: \n{result_np}\n')

# or, do it yourself by defining a class:
class myMatrices:
    def __init__(self, A, B):
        self.A = A
        self.B = B

    def mult(self):
        N = self.A.shape[0]  # number of rows in A
        K = self.B.shape[1]  # number of cols in B
        out = np.empty((N, K))  # initialize output
        for i in range(N):
            for j in range(K):
                out[i, j] = sum(self.A[i, :] * self.B[:, j])
        return out


# create an object:
test = myMatrices(a, b)

# access local variables:
print(f'test.A: \n{test.A}\n')
print(f'test.B: \n{test.B}\n')

# use object method:
result_own = test.mult()
print(f'result_own: \n{result_own}\n')
Adv-ObjOr3.py
import numpy as np

# multiply these two matrices:
a = np.array([[3, 6, 1], [2, 7, 4]])
b = np.array([[1, 8, 6], [3, 5, 8], [1, 1, 2]])


# define your own class:
class myMatrices:
    def __init__(self, A, B):
        self.A = A
        self.B = B

    def mult(self):
        N = self.A.shape[0]  # number of rows in A
        K = self.B.shape[1]  # number of cols in B
        out = np.empty((N, K))  # initialize output
        for i in range(N):
            for j in range(K):
                out[i, j] = sum(self.A[i, :] * self.B[:, j])
        return out


# define a subclass:
class myMatNew(myMatrices):
    def getTotalElem(self):
        N = self.A.shape[0]  # number of rows in A
        K = self.B.shape[1]  # number of cols in B
        return N * K


# create an object of the subclass:
test = myMatNew(a, b)

# use a method of myMatrices:
result_own = test.mult()
print(f'result_own: \n{result_own}\n')

# use a method of myMatNew:
totalElem = test.getTotalElem()
print(f'totalElem: {totalElem}\n')
advancedPy.py
if condition:
    expression1
else:
    expression2

p = 0.6
if p <= 0.05:
    print("reject H0!")
else:
    print("don't reject H0!")

fruits = ["apple", "banana", "cherry"]
for x in sequence:
    [some commands]


#
a = [2,6,3,6]
type(a).__name__
a.count(6)
a.sort()
CDF-example.py
import scipy.stats as stats

# binomial CDF:
p1 = stats.binom.cdf(3, 10, 0.2)
print(f'p1: {p1}\n')

# normal CDF:
p2 = stats.norm.cdf(1.96) - stats.norm.cdf(-1.96)
print(f'p2: {p2}\n')
CDF-figure.py
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# binomial:
# support of binomial PMF:
x_binom = np.linspace(-1, 10, num=1000)

# PMF for all these values:
cdf_binom = stats.binom.cdf(x_binom, 10, 0.2)

# plot:
plt.step(x_binom, cdf_binom, linestyle='-', color='black')
plt.xlabel('x')
plt.ylabel('Fx')
plt.savefig('PyGraphs/CDF-figure-discrete.pdf')
plt.close()

# normal:
# support of normal density:
x_norm = np.linspace(-4, 4, num=1000)

# PDF for all these values:
cdf_norm = stats.norm.cdf(x_norm)

# plot:
plt.plot(x_norm, cdf_norm, linestyle='-', color='black')
plt.xlabel('x')
plt.ylabel('Fx')
plt.savefig('PyGraphs/CDF-figure-cont.pdf')
CI.py
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# set the random seed:
np.random.seed(123456)

# set sample size and MC simulations:
r = 10000
n = 100

# initialize arrays to later store results:
CIlower = np.empty(r)
CIupper = np.empty(r)
pvalue1 = np.empty(r)
pvalue2 = np.empty(r)

# repeat r times:
for j in range(r):
    # draw a sample:
    sample = np.random.normal(10, 2, n)
    sample_mean = np.mean(sample)
    sample_sd = np.std(sample, ddof=1)
    # test the (correct) null hypothesis mu=10:
    testres1 = stats.ttest_1samp(sample, popmean=10)
    pvalue1[j] = testres1.pvalue
    cv = stats.t.ppf(0.975, df=n - 1)
    CIlower[j] = sample_mean - cv * sample_sd / np.sqrt(n)
    CIupper[j] = sample_mean + cv * sample_sd / np.sqrt(n)
    # test the (incorrect) null hypothesis mu=9.5 & store the p value:
    testres2 = stats.ttest_1samp(sample, popmean=9.5)
    pvalue2[j] = testres2.pvalue

##################
## correct H0   ##
##################

plt.figure(figsize=(3, 5))  # set figure ratio
plt.ylim(0, 101)
plt.xlim(9, 11)
for j in range(1, 101):
    if 10 > CIlower[j] and 10 < CIupper[j]:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='grey')
    else:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='black')
plt.axvline(10, linestyle='--', color='black', linewidth=0.5)
plt.ylabel('Sample No.')
plt.savefig('PyGraphs/Simulation-Inference-Figure1.pdf')

##################
## incorrect H0 ##
##################

plt.figure(figsize=(3, 5))  # set figure ratio
plt.ylim(0, 101)
plt.xlim(9, 11)
for j in range(1, 101):
    if 9.5 > CIlower[j] and 9.5 < CIupper[j]:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='grey')
    else:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='black')
plt.axvline(9.5, linestyle='--', color='black', linewidth=0.5)
plt.ylabel('Sample No.')
plt.savefig('PyGraphs/Simulation-Inference-Figure2.pdf')
Critical-Values-t.py
import numpy as np
import pandas as pd
import scipy.stats as stats

# degrees of freedom = n-1:
df = 19

# significance levels:
alpha_one_tailed = np.array([0.1, 0.05, 0.025, 0.01, 0.005, .001])
alpha_two_tailed = alpha_one_tailed * 2

# critical values & table:
CV = stats.t.ppf(1 - alpha_one_tailed, df)
table = pd.DataFrame({'alpha_one_tailed': alpha_one_tailed,
                      'alpha_two_tailed': alpha_two_tailed, 'CV': CV})
print(f'table: \n{table}\n')
Descr-Boxplot.py
import wooldridge as woo
import matplotlib.pyplot as plt

ceosal1 = woo.dataWoo('ceosal1')

# extract roe and salary:
roe = ceosal1['roe']
consprod = ceosal1['consprod']

# plotting descriptive statistics:
plt.boxplot(roe, vert=False)
plt.ylabel('roe')
plt.savefig('PyGraphs/Boxplot1.pdf')
plt.close()

# plotting descriptive statistics:
roe_cp0 = roe[consprod == 0]
roe_cp1 = roe[consprod == 1]

plt.boxplot([roe_cp0, roe_cp1])
plt.ylabel('roe')
plt.savefig('PyGraphs/Boxplot2.pdf')
Descr-ECDF.py
import wooldridge as woo
import numpy as np
import matplotlib.pyplot as plt

ceosal1 = woo.dataWoo('ceosal1')

# extract roe:
roe = ceosal1['roe']

# calculate ECDF:
x = np.sort(roe)
n = x.size
y = np.arange(1, n + 1) / n  # generates cumulative shares of observations

# plot a step function:
plt.step(x, y, linestyle='-', color='black')
plt.xlabel('roe')
plt.savefig('PyGraphs/ecdf.pdf')
Descr-Figures.py
import wooldridge as woo
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

affairs = woo.dataWoo('affairs')

# attach labels (see previous script):
affairs['ratemarr'] = affairs['ratemarr'] - 1
affairs['haskids'] = pd.Categorical.from_codes(affairs['kids'],
                                               categories=['no', 'yes'])
mlab = ['very unhappy', 'unhappy', 'average', 'happy', 'very happy']
affairs['marriage'] = pd.Categorical.from_codes(affairs['ratemarr'],
                                                categories=mlab)

# counts for all graphs:
counts = affairs['marriage'].value_counts()
counts_bykids = affairs['marriage'].groupby(affairs['haskids']).value_counts()
counts_yes = counts_bykids['yes']
counts_no = counts_bykids['no']

# pie chart (a):
grey_colors = ['0.3', '0.4', '0.5', '0.6', '0.7']
plt.pie(counts, labels=mlab, colors=grey_colors)
plt.savefig('PyGraphs/Descr-Pie.pdf')
plt.close()

# horizontal bar chart (b):
y_pos = [0, 1, 2, 3, 4]  # the y locations for the bars
plt.barh(y_pos, counts, color='0.6')
plt.yticks(y_pos, mlab, rotation=60)  # add and adjust labeling
plt.savefig('PyGraphs/Descr-Bar1.pdf')
plt.close()

# stacked bar plot (c):
x_pos = [0, 1, 2, 3, 4]  # the x locations for the bars
plt.bar(x_pos, counts_yes, width=0.4, color='0.6', label='Yes')
# with 'bottom=counts_yes' bars are added on top of previous ones:
plt.bar(x_pos, counts_no, width=0.4, bottom=counts_yes, color='0.3', label='No')
plt.ylabel('Counts')
plt.xticks(x_pos, mlab)  # add labels on x axis
plt.legend()
plt.savefig('PyGraphs/Descr-Bar2.pdf')
plt.close()

# grouped bar plot (d)
# add left bars first and move bars to the left:
x_pos_leftbar = [-0.2, 0.8, 1.8, 2.8, 3.8]
plt.bar(x_pos_leftbar, counts_yes, width=0.4, color='0.6', label='Yes')
# add right bars first and move bars to the right:
x_pos_rightbar = [0.2, 1.2, 2.2, 3.2, 4.2]
plt.bar(x_pos_rightbar, counts_no, width=0.4, color='0.3', label='No')
plt.ylabel('Counts')
plt.xticks(x_pos, mlab)
plt.legend()
plt.savefig('PyGraphs/Descr-Bar3.pdf')
Descr-Stats.py
import wooldridge as woo
import numpy as np

ceosal1 = woo.dataWoo('ceosal1')

# extract roe and salary:
roe = ceosal1['roe']
salary = ceosal1['salary']

# sample average:
roe_mean = np.mean(salary)
print(f'roe_mean: {roe_mean}\n')

# sample median:
roe_med = np.median(salary)
print(f'roe_med: {roe_med}\n')

# standard deviation:
roe_s = np.std(salary, ddof=1)
print(f'roe_s: {roe_s}\n')

# correlation with ROE:
roe_corr = np.corrcoef(roe, salary)
print(f'roe_corr: \n{roe_corr}\n')
Descr-Tables.py
import wooldridge as woo
import numpy as np
import pandas as pd

affairs = woo.dataWoo('affairs')

# adjust codings to [0-4] (Categoricals require a start from 0):
affairs['ratemarr'] = affairs['ratemarr'] - 1

# use a pandas.Categorical object to attach labels for "haskids":
affairs['haskids'] = pd.Categorical.from_codes(affairs['kids'],
                                               categories=['no', 'yes'])
# ... and "marriage" (for example: 0 = 'very unhappy', 1 = 'unhappy',...):
mlab = ['very unhappy', 'unhappy', 'average', 'happy', 'very happy']
affairs['marriage'] = pd.Categorical.from_codes(affairs['ratemarr'],
                                                categories=mlab)

# frequency table in numpy (alphabetical order of elements):
ft_np = np.unique(affairs['marriage'], return_counts=True)
unique_elem_np = ft_np[0]
counts_np = ft_np[1]
print(f'unique_elem_np: \n{unique_elem_np}\n')
print(f'counts_np: \n{counts_np}\n')

# frequency table in pandas:
ft_pd = affairs['marriage'].value_counts()
print(f'ft_pd: \n{ft_pd}\n')

# frequency table with groupby:
ft_pd2 = affairs['marriage'].groupby(affairs['haskids']).value_counts()
print(f'ft_pd2: \n{ft_pd2}\n')

# contingency table in pandas:
ct_all_abs = pd.crosstab(affairs['marriage'], affairs['haskids'], margins=3)
print(f'ct_all_abs: \n{ct_all_abs}\n')
ct_all_rel = pd.crosstab(affairs['marriage'], affairs['haskids'], normalize='all')
print(f'ct_all_rel: \n{ct_all_rel}\n')

# share within "marriage" (i.e. within a row):
ct_row = pd.crosstab(affairs['marriage'], affairs['haskids'], normalize='index')
print(f'ct_row: \n{ct_row}\n')

# share within "haskids"  (i.e. within a column):
ct_col = pd.crosstab(affairs['marriage'], affairs['haskids'], normalize='columns')
print(f'ct_col: \n{ct_col}\n')
Dicts-Copy.py
# define and print a dict:
var1 = ['Florian', 'Daniel']
var2 = [96, 49]
var3 = [True, False]
example_dict = dict(name=var1, points=var2, passed=var3)
print(f'example_dict: {example_dict}\n')

# if you want to work on a copy:
import copy
copied_dict = copy.deepcopy(example_dict)
copied_dict['points'][1] = copied_dict['points'][1] - 40
print(f'example_dict: \n{example_dict}\n')
print(f'copied_dict: \n{copied_dict}\n')
Dicts.py
# define and print a dict:
var1 = ['Florian', 'Daniel']
var2 = [96, 49]
var3 = [True, False]
example_dict = dict(name=var1, points=var2, passed=var3)
print(f'example_dict: \n{example_dict}\n')

# another way to define the dict:
example_dict2 = {'name': var1, 'points': var2, 'passed': var3}
print(f'example_dict2: \n{example_dict2}\n')

# get data type:
print(f'type(example_dict): {type(example_dict)}\n')

# access 'points':
points_all = example_dict['points']
print(f'points_all: {points_all}\n')

# access 'points' of Daniel:
points_daniel = example_dict['points'][1]
print(f'points_daniel: {points_daniel}\n')

# add 4 to 'points' of Daniel and let him pass:
example_dict['points'][1] = example_dict['points'][1] + 4
example_dict['passed'][1] = True
print(f'example_dict: \n{example_dict}\n')

# add a new variable 'grade':
example_dict['grade'] = [1.3, 4.0]

# delete variable 'points':
del example_dict['points']
print(f'example_dict: \n{example_dict}\n')
Example-B-6.py
import scipy.stats as stats

# first example using the transformation:
p1_1 = stats.norm.cdf(2 / 3) - stats.norm.cdf(-2 / 3)
print(f'p1_1: {p1_1}\n')

# first example working directly with the distribution of X:
p1_2 = stats.norm.cdf(6, 4, 3) - stats.norm.cdf(2, 4, 3)
print(f'p1_2: {p1_2}\n')

# second example:
p2 = 1 - stats.norm.cdf(2, 4, 3) + stats.norm.cdf(-2, 4, 3)
print(f'p2: {p2}\n')
Example-C-2.py
import numpy as np
import scipy.stats as stats

# manually enter raw data from Wooldridge, Table C.3:
SR87 = np.array([10, 1, 6, .45, 1.25, 1.3, 1.06, 3, 8.18, 1.67,
                 .98, 1, .45, 5.03, 8, 9, 18, .28, 7, 3.97])
SR88 = np.array([3, 1, 5, .5, 1.54, 1.5, .8, 2, .67, 1.17, .51,
                 .5, .61, 6.7, 4, 7, 19, .2, 5, 3.83])

# calculate change:
Change = SR88 - SR87

# ingredients to CI formula:
avgCh = np.mean(Change)
print(f'avgCh: {avgCh}\n')

n = len(Change)
sdCh = np.std(Change, ddof=1)
se = sdCh / np.sqrt(n)
print(f'se: {se}\n')

c = stats.t.ppf(0.975, n - 1)
print(f'c: {c}\n')

# confidence interval:
lowerCI = avgCh - c * se
print(f'lowerCI: {lowerCI}\n')

upperCI = avgCh + c * se
print(f'upperCI: {upperCI}\n')
Example-C-3.py
import wooldridge as woo
import numpy as np
import scipy.stats as stats

audit = woo.dataWoo('audit')
y = audit['y']

# ingredients to CI formula:
avgy = np.mean(y)
n = len(y)
sdy = np.std(y, ddof=1)
se = sdy / np.sqrt(n)
c95 = stats.norm.ppf(0.975)
c99 = stats.norm.ppf(0.995)

# 95% confidence interval:
lowerCI95 = avgy - c95 * se
print(f'lowerCI95: {lowerCI95}\n')

upperCI95 = avgy + c95 * se
print(f'upperCI95: {upperCI95}\n')

# 99% confidence interval:
lowerCI99 = avgy - c99 * se
print(f'lowerCI99: {lowerCI99}\n')

upperCI99 = avgy + c99 * se
print(f'upperCI99: {upperCI99}\n')
Example-C-5.py
import wooldridge as woo
import numpy as np
import pandas as pd
import scipy.stats as stats

audit = woo.dataWoo('audit')
y = audit['y']

# automated calculation of t statistic for H0 (mu=0):
test_auto = stats.ttest_1samp(y, popmean=0)
t_auto = test_auto.statistic  # access test statistic
p_auto = test_auto.pvalue  # access two-sided p value
print(f't_auto: {t_auto}\n')
print(f'p_auto/2: {p_auto / 2}\n')

# manual calculation of t statistic for H0 (mu=0):
avgy = np.mean(y)
n = len(y)
sdy = np.std(y, ddof=1)
se = sdy / np.sqrt(n)
t_manual = avgy / se
print(f't_manual: {t_manual}\n')

# critical values for t distribution with n-1=240 d.f.:
alpha_one_tailed = np.array([0.1, 0.05, 0.025, 0.01, 0.005, .001])
CV = stats.t.ppf(1 - alpha_one_tailed, 240)
table = pd.DataFrame({'alpha_one_tailed': alpha_one_tailed, 'CV': CV})
print(f'table: \n{table}\n')
Example-C-6.py
import numpy as np
import scipy.stats as stats

# manually enter raw data from Wooldridge, Table C.3:
SR87 = np.array([10, 1, 6, .45, 1.25, 1.3, 1.06, 3, 8.18, 1.67,
                 .98, 1, .45, 5.03, 8, 9, 18, .28, 7, 3.97])
SR88 = np.array([3, 1, 5, .5, 1.54, 1.5, .8, 2, .67, 1.17, .51,
                 .5, .61, 6.7, 4, 7, 19, .2, 5, 3.83])
Change = SR88 - SR87

# automated calculation of t statistic for H0 (mu=0):
test_auto = stats.ttest_1samp(Change, popmean=0)
t_auto = test_auto.statistic
p_auto = test_auto.pvalue
print(f't_auto: {t_auto}\n')
print(f'p_auto/2: {p_auto / 2}\n')

# manual calculation of t statistic for H0 (mu=0):
avgCh = np.mean(Change)
n = len(Change)
sdCh = np.std(Change, ddof=1)
se = sdCh / np.sqrt(n)
t_manual = avgCh / se
print(f't_manual: {t_manual}\n')

# manual calculation of p value for H0 (mu=0):
p_manual = stats.t.cdf(t_manual, n - 1)
print(f'p_manual: {p_manual}\n')
Example-C-7.py
import wooldridge as woo
import numpy as np
import pandas as pd
import scipy.stats as stats

audit = woo.dataWoo('audit')
y = audit['y']

# automated calculation of t statistic for H0 (mu=0):
test_auto = stats.ttest_1samp(y, popmean=0)
t_auto = test_auto.statistic
p_auto = test_auto.pvalue
print(f't_auto: {t_auto}\n')
print(f'p_auto/2: {p_auto/2}\n')

# manual calculation of t statistic for H0 (mu=0):
avgy = np.mean(y)
n = len(y)
sdy = np.std(y, ddof=1)
se = sdy / np.sqrt(n)
t_manual = avgy / se
print(f't_manual: {t_manual}\n')

# manual calculation of p value for H0 (mu=0):
p_manual = stats.t.cdf(t_manual, n - 1)
print(f'p_manual: {p_manual}\n')
First-Python-Script.py
# This is a comment.
# in the next line, we try to enter Shakespeare:
'To be, or not to be: that is the question'
# let's try some sensible math:
print((1 + 2) * 5)
16 ** 0.5
print('\n')
Graphs-Basics.py
import matplotlib.pyplot as plt

# create data:
x = [1, 3, 4, 7, 8, 9]
y = [0, 3, 6, 9, 7, 8]

# plot and save:
plt.plot(x, y, color='black')
plt.savefig('PyGraphs/Graphs-Basics-a.pdf')
plt.close()
Graphs-Basics2.py
import matplotlib.pyplot as plt

# create data:
x = [1, 3, 4, 7, 8, 9]
y = [0, 3, 6, 9, 7, 8]

# plot and save:
plt.plot(x, y, color='black', linestyle='--')
plt.savefig('PyGraphs/Graphs-Basics-b.pdf')
plt.close()

plt.plot(x, y, color='black', linestyle=':')
plt.savefig('PyGraphs/Graphs-Basics-c.pdf')
plt.close()

plt.plot(x, y, color='black', linestyle='-', linewidth=3)
plt.savefig('PyGraphs/Graphs-Basics-d.pdf')
plt.close()

plt.plot(x, y, color='black', marker='o')
plt.savefig('PyGraphs/Graphs-Basics-e.pdf')
plt.close()

plt.plot(x, y, color='black', marker='v', linestyle='')
plt.savefig('PyGraphs/Graphs-Basics-f.pdf')
Graphs-BuildingBlocks.py
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# support for all normal densities:
x = np.linspace(-4, 4, num=100)
# get different density evaluations:
y1 = stats.norm.pdf(x, 0, 1)
y2 = stats.norm.pdf(x, 1, 0.5)
y3 = stats.norm.pdf(x, 0, 2)

# plot:
plt.plot(x, y1, linestyle='-', color='black', label='standard normal')
plt.plot(x, y2, linestyle='--', color='0.3', label='mu = 1, sigma = 0.5')
plt.plot(x, y3, linestyle=':', color='0.6', label='$\mu = 0$, $\sigma = 2$')
plt.xlim(-3, 4)
plt.title('Normal Densities')
plt.ylabel('$\phi(x)$')
plt.xlabel('x')
plt.legend()
plt.savefig('PyGraphs/Graphs-BuildingBlocks.pdf')
Graphs-Export.py
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# support for all normal densities:
x = np.linspace(-4, 4, num=100)

# get different density evaluations:
y1 = stats.norm.pdf(x, 0, 1)
y2 = stats.norm.pdf(x, 0, 3)

# plot (a):
plt.figure(figsize=(4, 6))
plt.plot(x, y1, linestyle='-', color='black')
plt.plot(x, y2, linestyle='--', color='0.3')
plt.savefig('PyGraphs/Graphs-Export-a.pdf')
plt.close()

# plot (b):
plt.figure(figsize=(6, 4))
plt.plot(x, y1, linestyle='-', color='black')
plt.plot(x, y2, linestyle='--', color='0.3')
plt.savefig('PyGraphs/Graphs-Export-b.png')
Graphs-Functions.py
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# support of quadratic function
# (creates an array with 100 equispaced elements from -3 to 2):
x1 = np.linspace(-3, 2, num=100)
# function values for all these values:
y1 = x1 ** 2

# plot quadratic function:
plt.plot(x1, y1, linestyle='-', color='black')
plt.savefig('PyGraphs/Graphs-Functions-a.pdf')
plt.close()

# same for normal density:
x2 = np.linspace(-4, 4, num=100)
y2 = stats.norm.pdf(x2)

# plot normal density:
plt.plot(x2, y2, linestyle='-', color='black')
plt.savefig('PyGraphs/Graphs-Functions-b.pdf')
Histogram.py
import wooldridge as woo
import matplotlib.pyplot as plt

ceosal1 = woo.dataWoo('ceosal1')

# extract roe:
roe = ceosal1['roe']

# subfigure a (histogram with counts):
plt.hist(roe, color='grey')
plt.ylabel('Counts')
plt.xlabel('roe')
plt.savefig('PyGraphs/Histogram1.pdf')
plt.close()

# subfigure b (histogram with density and explicit breaks):
breaks = [0, 5, 10, 20, 30, 60]
plt.hist(roe, color='grey', bins=breaks, density=True)
plt.ylabel('density')
plt.xlabel('roe')
plt.savefig('PyGraphs/Histogram2.pdf')
Import-Export.py
import pandas as pd

# import csv with pandas:
df1 = pd.read_csv('data/sales.csv', delimiter=',', header=None,
                  names=['year', 'product1', 'product2', 'product3'])
print(f'df1: \n{df1}\n')

# import txt with pandas:
df2 = pd.read_table('data/sales.txt', delimiter=' ')
print(f'df2: \n{df2}\n')

# add a row to df1:
df3 = df1.append({'year': 2014, 'product1': 10, 'product2': 8, 'product3': 2},
                 ignore_index=True)
print(f'df3: \n{df3}\n')

# export with pandas:
df3.to_csv('data/sales2.csv')
Import-StockData.py
import pandas_datareader as pdr

# download data for 'F' (= Ford Motor Company) and define start and end:
tickers = ['F']
start_date = '2014-01-01'
end_date = '2015-12-31'

# use pandas_datareader for the import:
F_data = pdr.data.DataReader(tickers, 'yahoo', start_date, end_date)

# look at imported data:
print(f'F_data.head(): \n{F_data.head()}\n')
print(f'F_data.tail(): \n{F_data.tail()}\n')
KDensity.py
import wooldridge as woo
import statsmodels.api as sm
import matplotlib.pyplot as plt

ceosal1 = woo.dataWoo('ceosal1')

# extract roe:
roe = ceosal1['roe']

# estimate kernel density:
kde = sm.nonparametric.KDEUnivariate(roe)
kde.fit()

# subfigure a (kernel density):
plt.plot(kde.support, kde.density, color='black', linewidth=2)
plt.ylabel('density')
plt.xlabel('roe')
plt.savefig('PyGraphs/Density1.pdf')
plt.close()

# subfigure b (kernel density with overlayed histogram):
plt.hist(roe, color='grey', density=True)
plt.plot(kde.support, kde.density, color='black', linewidth=2)
plt.ylabel('density')
plt.xlabel('roe')
plt.savefig('PyGraphs/Density2.pdf')
Lists-Copy.py
# define a list:
example_list = [1, 5, 41.3, 2.0]

# be careful with changes on variables pointing on example_list:
duplicate_list = example_list
duplicate_list[3] = 10000
print(f'duplicate_list: {duplicate_list}\n')
print(f'example_list: {example_list}\n')

# work on a copy of example_list:
example_list = [1, 5, 41.3, 2.0]
duplicate_list = example_list[:]
duplicate_list[3] = 10000
print(f'duplicate_list: {duplicate_list}\n')
print(f'example_list: {example_list}\n')
Lists.py
# define a list:
example_list = [1, 5, 41.3, 2.0]
print(f'type(example_list): {type(example_list)}\n')

# access first entry by index:
first_entry = example_list[0]
print(f'first_entry: {first_entry}\n')

# access second to fourth entry by index:
range2to4 = example_list[1:4]
print(f'range2to4: {range2to4}\n')

# replace third entry by new value:
example_list[2] = 3
print(f'example_list: {example_list}\n')

# apply a function:
function_output = min(example_list)
print(f'function_output: {function_output}\n')

# apply a method:
example_list.sort()
print(f'example_list: {example_list}\n')

# delete third element of sorted list:
del example_list[2]
print(f'example_list: {example_list}\n')
MCSim-lln-clt-n.py
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
import matplotlib.pyplot as plt

##################
## LLN (normal) ##
##################

# set the random seed:
np.random.seed(123456)

# set sample sizes and MC simulations:
n = [10, 50, 100, 1000]
r = 10000

# support of normal density (fixed):
x_range = np.linspace(8.5, 11.5)

for i in n:
    ybar = np.empty(r)
    for j in range(r):
        # sample of size n
        sample = stats.norm.rvs(10, 2, size=i)
        ybar[j] = np.mean(sample)

    # simulated density:
    kde = sm.nonparametric.KDEUnivariate(ybar)
    kde.fit()

    # normal density:
    y = stats.norm.pdf(x_range, 10, 2 / np.sqrt(i))
    # plotting:
    filename = 'PyGraphs/MCSim-lln-n' + str(i) + '.pdf'
    plt.ylim(top=2)
    plt.xlim(8.5, 11.5)
    plt.plot(kde.support, kde.density, color='black', label='ybar')
    plt.plot(x_range, y, linestyle='--', color='black', label='normal distribution')
    plt.ylabel('density')
    plt.xlabel('ybar')
    plt.legend()
    plt.savefig(filename)
    plt.close()

##################
## CLT (chisq) ##
##################

# density:
x_range = np.linspace(0, 3)
y = stats.chi2.pdf(x_range, df=1)
plt.plot(x_range, y, linestyle='-', color='black')
plt.ylabel('density')
plt.savefig('PyGraphs/MCSim-chisqdens.pdf')
plt.close()

# set the random seed:
np.random.seed(123456)

# set sample sizes and MC simulations:
n = [2, 10, 100, 10000]
r = 10000


for i in n:
    ybar = np.empty(r)
    for j in range(r):
        # sample of size n
        sample = stats.chi2.rvs(1, size=i)
        ybar[j] = np.mean(sample)

    # simulated density:
    kde = sm.nonparametric.KDEUnivariate(ybar)
    kde.fit()

    # normal density:
    x_range = np.linspace(min(ybar), max(ybar))
    y = stats.norm.pdf(x_range, 1, np.sqrt(2/i))
    # plotting:
    filename = 'PyGraphs/MCSim-clt-n' + str(i) + '.pdf'
    plt.plot(kde.support, kde.density, color='black', label='ybar')
    plt.plot(x_range, y, linestyle='--', color='black', label='normal distribution')
    plt.ylabel('density')
    plt.xlabel('ybar')
    plt.legend()
    plt.savefig(filename)
    plt.close()
Module-Math.py
import math as someAlias

result1 = someAlias.sqrt(16)
print(f'result1: {result1}\n')

result2 = someAlias.pi
print(f'Pi: {result2}\n')

result3 = someAlias.e
print(f'Eulers number: {result3}\n')
Numpy-Arrays.py
import numpy as np

# define arrays in numpy:
testarray1D = np.array([1, 5, 41.3, 2.0])
print(f'type(testarray1D): {type(testarray1D)}\n')

testarray2D = np.array([[4, 9, 8, 3],
                        [2, 6, 3, 2],
                        [1, 1, 7, 4]])

# get dimensions of testarray2D:
dim = testarray2D.shape
print(f'dim: {dim}\n')

# access elements by indices:
third_elem = testarray1D[2]
print(f'third_elem: {third_elem}\n')

second_third_elem = testarray2D[1, 2]  # element in 2nd row and 3rd column
print(f'second_third_elem: {second_third_elem}\n')

second_to_third_col = testarray2D[:, 1:3]  # each row in the 2nd and 3rd column
print(f'second_to_third_col: \n{second_to_third_col}\n')

# access elements by lists:
first_third_elem = testarray1D[[0, 2]]
print(f'first_third_elem: {first_third_elem}\n')

# same with Boolean lists:
first_third_elem2 = testarray1D[[True, False, True, False]]
print(f'first_third_elem2: {first_third_elem2}\n')

k = np.array([[True, False, False, False],
              [False, False, True, False],
              [True, False, True, False]])
elem_by_index = testarray2D[k]  # 1st elem in 1st row, 3rd elem in 2nd row...
print(f'elem_by_index: {elem_by_index}\n')
Numpy-Operations.py
import numpy as np

# define an arrays in numpy:
mat1 = np.array([[4, 9, 8],
                 [2, 6, 3]])
mat2 = np.array([[1, 5, 2],
                 [6, 6, 0],
                 [4, 8, 3]])

# use a numpy function:
result1 = np.exp(mat1)
print(f'result1: \n{result1}\n')

result2 = mat1 + mat2[[0, 1]]  # same as np.add(mat1, mat2[[0, 1]])
print(f'result2: \n{result2}\n')

# use a method:
mat1_tr = mat1.transpose()
print(f'mat1_tr: \n{mat1_tr}\n')

# matrix algebra:
matprod = mat1.dot(mat2)  # same as  mat1 @ mat2
print(f'matprod: \n{matprod}\n')
Numpy-SpecialCases.py
import numpy as np

# array of integers defined by the arguments start, end and sequence length:
sequence = np.linspace(0, 2, num=11)
print(f'sequence: \n{sequence}\n')

# sequence of integers starting at 0, ending at 5-1:
sequence_int = np.arange(5)
print(f'sequence_int: \n{sequence_int}\n')

# initialize array with each element set to zero:
zero_array = np.zeros((4, 3))
print(f'zero_array: \n{zero_array}\n')

# initialize array with each element set to one:
one_array = np.ones((2, 5))
print(f'one_array: \n{one_array}\n')

# uninitialized array (filled with arbitrary nonsense elements):
empty_array = np.empty((2, 3))
print(f'empty_array: \n{empty_array}\n')
Objects-in-Python.py
result1 = 1 + 1
# determine the type:
type_result1 = type(result1)
# print the result:
print(f'type_result1: {type_result1}')

result2 = 2.5
type_result2 = type(result2)
print(f'type_result2: {type_result2}')

result3 = 'To be, or not to be: that is the question'
type_result3 = type(result3)
print(f'type_result3: {type_result3}\n')
Pandas-Operations.py
import numpy as np
import pandas as pd

# define a pandas DataFrame:
icecream_sales = np.array([30, 40, 35, 130, 120, 60])
weather_coded = np.array([0, 1, 0, 1, 1, 0])
customers = np.array([2000, 2100, 1500, 8000, 7200, 2000])
df = pd.DataFrame({'icecream_sales': icecream_sales,
                   'weather_coded': weather_coded,
                   'customers': customers})

# define and assign an index (six ends of month starting in April, 2010)
# (details on generating indices are given in Chapter 10):
ourIndex = pd.date_range(start='04/2010', freq='M', periods=6)
df.set_index(ourIndex, inplace=True)

# include sales two months ago:
df['icecream_sales_lag2'] = df['icecream_sales'].shift(2)
print(f'df: \n{df}\n')

# use a pandas.Categorical object to attach labels (0 = bad; 1 = good):
df['weather'] = pd.Categorical.from_codes(codes=df['weather_coded'],
                                          categories=['bad', 'good'])
print(f'df: \n{df}\n')

# mean sales for each weather category:
group_means = df.groupby('weather').mean()
print(f'group_means: \n{group_means}\n')
Pandas.py
import numpy as np
import pandas as pd

# define a pandas DataFrame:
icecream_sales = np.array([30, 40, 35, 130, 120, 60])
weather_coded = np.array([0, 1, 0, 1, 1, 0])
customers = np.array([2000, 2100, 1500, 8000, 7200, 2000])
df = pd.DataFrame({'icecream_sales': icecream_sales,
                   'weather_coded': weather_coded,
                   'customers': customers})

# define and assign an index (six ends of month starting in April, 2010)
# (details on generating indices are given in Chapter 10):
ourIndex = pd.date_range(start='04/2010', freq='M', periods=6)
df.set_index(ourIndex, inplace=True)

# print the DataFrame
print(f'df: \n{df}\n')

# access columns by variable names:
subset1 = df[['icecream_sales', 'customers']]
print(f'subset1: \n{subset1}\n')

# access second to fourth row:
subset2 = df[1:4]  # same as df['2010-05-31':'2010-07-31']
print(f'subset2: \n{subset2}\n')

# access rows and columns by index and variable names:
subset3 = df.loc['2010-05-31', 'customers']  # same as df.iloc[1,2]
print(f'subset3: \n{subset3}\n')

# access rows and columns by index and variable integer positions:
subset4 = df.iloc[1:4, 0:2]
# same as df.loc['2010-05-31':'2010-07-31', ['icecream_sales','weather']]
print(f'subset4: \n{subset4}\n')
PDF-example.py
import scipy.stats as stats
import numpy as np
import matplotlib.pyplot as plt

# support of normal density:
x_range = np.linspace(-4, 4, num=100)

# PDF for all these values:
pdf = stats.norm.pdf(x_range)

# plot:
plt.plot(x_range, pdf, linestyle='-', color='black')
plt.xlabel('x')
plt.ylabel('dx')
plt.savefig('PyGraphs/PDF-example.pdf')
PMF-binom.py
import scipy.stats as stats
import math

# pedestrian approach:
c = math.factorial(10) / (math.factorial(2) * math.factorial(10 - 2))
p1 = c * (0.2 ** 2) * (0.8 ** 8)
print(f'p1: {p1}\n')

# scipy function:
p2 = stats.binom.pmf(2, 10, 0.2)
print(f'p2: {p2}\n')
PMF-example.py
import scipy.stats as stats
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# values for x (all between 0 and 10):
x = np.linspace(0, 10, num=11)

# PMF for all these values:
fx = stats.binom.pmf(x, 10, 0.2)

# collect values in DataFrame:
result = pd.DataFrame({'x': x, 'fx': fx})
print(f'result: \n{result}\n')

# plot:
plt.bar(x, fx, color='0.6')
plt.ylabel('x')
plt.ylabel('fx')
plt.savefig('PyGraphs/PMF-example.pdf')
Python-as-a-Calculator.py
result1 = 1 + 1
print(f'result1: {result1}\n')

result2 = 5 * (4 - 1) ** 2
print(f'result2: {result2}\n')

result3 = [result1, result2]
print(f'result3: \n{result3}\n')
Quantile-example.py
import scipy.stats as stats

q_975 = stats.norm.ppf(0.975)
print(f'q_975: {q_975}\n')
Random-Numbers.py
import numpy as np
import scipy.stats as stats

# sample from a standard normal RV with sample size n=5:
sample1 = stats.norm.rvs(size=5)
print(f'sample1: {sample1}\n')

# a different sample from the same distribution:
sample2 = stats.norm.rvs(size=5)
print(f'sample2: {sample2}\n')

# set the seed of the random number generator and take two samples:
np.random.seed(6254137)
sample3 = stats.norm.rvs(size=5)
print(f'sample3: {sample3}\n')

sample4 = stats.norm.rvs(size=5)
print(f'sample4: {sample4}\n')

# reset the seed to the same value to get the same samples again:
np.random.seed(6254137)
sample5 = stats.norm.rvs(size=5)
print(f'sample5: {sample5}\n')

sample6 = stats.norm.rvs(size=5)
print(f'sample6: {sample6}\n')
Simulate-Estimate.py
import numpy as np
import scipy.stats as stats

# set the random seed:
np.random.seed(123456)

# set sample size:
n = 100

# draw a sample given the population parameters:
sample1 = stats.norm.rvs(10, 2, size=n)

# estimate the population mean with the sample average:
estimate1 = np.mean(sample1)
print(f'estimate1: {estimate1}\n')

# draw a different sample and estimate again:
sample2 = stats.norm.rvs(10, 2, size=n)
estimate2 = np.mean(sample2)
print(f'estimate2: {estimate2}\n')

# draw a third sample and estimate again:
sample3 = stats.norm.rvs(10, 2, size=n)
estimate3 = np.mean(sample3)
print(f'estimate3: {estimate3}\n')
Simulation-Inference-Figure.py
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

# set the random seed:
np.random.seed(123456)

# set sample size and MC simulations:
r = 10000
n = 100

# initialize arrays to later store results:
CIlower = np.empty(r)
CIupper = np.empty(r)
pvalue1 = np.empty(r)
pvalue2 = np.empty(r)

# repeat r times:
for j in range(r):
    # draw a sample:
    sample = stats.norm.rvs(10, 2, size=n)
    sample_mean = np.mean(sample)
    sample_sd = np.std(sample, ddof=1)
    # test the (correct) null hypothesis mu=10:
    testres1 = stats.ttest_1samp(sample, popmean=10)
    pvalue1[j] = testres1.pvalue
    cv = stats.t.ppf(0.975, df=n - 1)
    CIlower[j] = sample_mean - cv * sample_sd / np.sqrt(n)
    CIupper[j] = sample_mean + cv * sample_sd / np.sqrt(n)
    # test the (incorrect) null hypothesis mu=9.5 & store the p value:
    testres2 = stats.ttest_1samp(sample, popmean=9.5)
    pvalue2[j] = testres2.pvalue

##################
## correct H0   ##
##################

plt.figure(figsize=(3, 5))  # set figure ratio
plt.ylim(0, 101)
plt.xlim(9, 11)
for j in range(1, 101):
    if 10 > CIlower[j] and 10 < CIupper[j]:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='grey')
    else:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='black')
plt.axvline(10, linestyle='--', color='black', linewidth=0.5)
plt.ylabel('Sample No.')
plt.savefig('PyGraphs/Simulation-Inference-Figure1.pdf')

##################
## incorrect H0 ##
##################

plt.figure(figsize=(3, 5))  # set figure ratio
plt.ylim(0, 101)
plt.xlim(9, 11)
for j in range(1, 101):
    if 9.5 > CIlower[j] and 9.5 < CIupper[j]:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='grey')
    else:
        plt.plot([CIlower[j], CIupper[j]], [j, j], linestyle='-', color='black')
plt.axvline(9.5, linestyle='--', color='black', linewidth=0.5)
plt.ylabel('Sample No.')
plt.savefig('PyGraphs/Simulation-Inference-Figure2.pdf')
Simulation-Inference.py
import numpy as np
import scipy.stats as stats

# set the random seed:
np.random.seed(123456)

# set sample size and MC simulations:
r = 10000
n = 100

# initialize arrays to later store results:
CIlower = np.empty(r)
CIupper = np.empty(r)
pvalue1 = np.empty(r)
pvalue2 = np.empty(r)

# repeat r times:
for j in range(r):
    # draw a sample:
    sample = stats.norm.rvs(10, 2, size=n)
    sample_mean = np.mean(sample)
    sample_sd = np.std(sample, ddof=1)

    # test the (correct) null hypothesis mu=10:
    testres1 = stats.ttest_1samp(sample, popmean=10)
    pvalue1[j] = testres1.pvalue
    cv = stats.t.ppf(0.975, df=n - 1)
    CIlower[j] = sample_mean - cv * sample_sd / np.sqrt(n)
    CIupper[j] = sample_mean + cv * sample_sd / np.sqrt(n)

    # test the (incorrect) null hypothesis mu=9.5 & store the p value:
    testres2 = stats.ttest_1samp(sample, popmean=9.5)
    pvalue2[j] = testres2.pvalue

# test results as logical value:
reject1 = pvalue1 <= 0.05
count1_true = np.count_nonzero(reject1)  # counts true
count1_false = r - count1_true
print(f'count1_true: {count1_true}\n')
print(f'count1_false: {count1_false}\n')

reject2 = pvalue2 <= 0.05
count2_true = np.count_nonzero(reject2)
count2_false = r - count2_true
print(f'count2_true: {count2_true}\n')
print(f'count2_false: {count2_false}\n')
Simulation-Repeated-Results.py
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
import matplotlib.pyplot as plt

# set the random seed:
np.random.seed(123456)

# set sample size:
n = 100

# initialize ybar to an array of length r=10000 to later store results:
r = 10000
ybar = np.empty(r)

# repeat r times:
for j in range(r):
    # draw a sample and store the sample mean in pos. j=0,1,... of ybar:
    sample = stats.norm.rvs(10, 2, size=n)
    ybar[j] = np.mean(sample)

# the first 20 of 10000 estimates:
print(f'ybar[0:19]: \n{ybar[0:19]}\n')

# simulated mean:
print(f'np.mean(ybar): {np.mean(ybar)}\n')

# simulated variance:
print(f'np.var(ybar, ddof=1): {np.var(ybar, ddof=1)}\n')

# simulated density:
kde = sm.nonparametric.KDEUnivariate(ybar)
kde.fit()

# normal density:
x_range = np.linspace(9, 11)
y = stats.norm.pdf(x_range, 10, np.sqrt(0.04))

# create graph:
plt.plot(kde.support, kde.density, color='black', label='ybar')
plt.plot(x_range, y, linestyle='--', color='black', label='normal distribution')
plt.ylabel('density')
plt.xlabel('ybar')
plt.legend()
plt.savefig('PyGraphs/Simulation-Repeated-Results.pdf')
Simulation-Repeated.py
import numpy as np
import scipy.stats as stats

# set the random seed:
np.random.seed(123456)

# set sample size:
n = 100

# initialize ybar to an array of length r=10000 to later store results:
r = 10000
ybar = np.empty(r)

# repeat r times:
for j in range(r):
    # draw a sample and store the sample mean in pos. j=0,1,... of ybar:
    sample = stats.norm.rvs(10, 2, size=n)
    ybar[j] = np.mean(sample)
smpl-bernoulli.py
import scipy.stats as stats

sample = stats.bernoulli.rvs(0.5, size=10)
print(f'sample: {sample}\n')
smpl-norm.py
import scipy.stats as stats

sample = stats.norm.rvs(size=10)
print(f'sample: {sample}\n')
Wooldridge.py
import wooldridge as woo

# load data:
wage1 = woo.dataWoo('wage1')

# get type:
print(f'type(wage1): \n{type(wage1)}\n')

# get an overview:
print(f'wage1.head(): \n{wage1.head()}\n')
Adv-Functions-MultArg.jl
# define function (only positional arguments):
function mysqrt_pos(x, add)
    if x >= 0
        result = x^0.5 + add
    else
        result = "You fool!"
    end
    return result
end

# define function ("x" as positional and "add" as keyword argument):
function mysqrt_kwd(x; add)
    if x >= 0
        result = x^0.5 + add
    else
        result = "You fool!"
    end
    return result
end

# call functions:
result1 = mysqrt_pos(4, 3)
println("result1 = $result1")
# mysqrt_pos(4, add = 3) is not valid

result2 = mysqrt_kwd(4, add=3)
println("result2 = $result2")
# mysqrt_kwd(4, 3) is not valid
Adv-Functions.jl
# define function:
function mysqrt(x)
    if x >= 0
        result = x^0.5
    else
        result = "You fool!"
    end
    return result
end

# call function and save result:
result1 = mysqrt(4)
println("result1 = $result1\n")

result2 = mysqrt(-1.5)
println("result2 = $result2")
Adv-Loops.jl
seq = [1, 2, 3, 4, 5, 6]
for i in seq
    if i < 4
        println(i^3)
    else
        println(i^2)
    end
end
Adv-Loops2.jl
seq = [1, 2, 3, 4, 5, 6]

for i in eachindex(seq)
    if seq[i] < 4
        println(seq[i]^3)
    else
        println(seq[i]^2)
    end
end
Adv-Performance-Jl-Figure.jl
using Random, DataFrames, Distributions, Plots, BenchmarkTools, CSV
# set the random seed:
Random.seed!(12345)

function simMean(n, reps)
    ybar = zeros(reps)
    for j in 1:reps
        # sample from normal distribution of size n
        sample = rand(Normal(), n)
        ybar[j] = mean(sample)
    end
    return ybar
end

# call the function multiple times and measure each time:
n = 100
r = 10000
step_length = 100
reps = range(100, r, step=100)
runTimeJl = zeros(Int(r / step_length))

for i in eachindex(reps)
    t = @benchmark simMean($n, $reps[$i])
    runTimeJl[i] = mean(t).time / 1e9 # mean(t).time in nanoseconds 
end

runtimeDf = DataFrame(runtime=runTimeJl)
CSV.write("Jlprocessed/01/Adv-Performance-Jl.csv", runtimeDf)

# import results (all in seconds):
R = CSV.read("Jlprocessed/01/Adv-Performance-R.csv", DataFrame)
Py = CSV.read("Jlprocessed/01/Adv-Performance-Py.csv", DataFrame)
Jl = CSV.read("Jlprocessed/01/Adv-Performance-Jl.csv", DataFrame)

# plot results
x_range = collect(reps)
plot(x_range, Jl.runtime, linestyle=:solid, color=:black,
    label="Julia", legend=:topleft)
plot!(x_range, R.runtime, linestyle=:dash, color=:black, label="R")
plot!(x_range, Py.runtime, linestyle=:dot, color=:black, label="Python")
xlabel!("Repetitions")
ylabel!("Runtime [seconds]")
savefig("JlGraphs/Adv-CompSpeed.pdf")


### Python-Code

# import random
# import numpy as np
# import timeit
# import pandas as pd
# import scipy.stats as stats

# np.random.seed(12345)

# def simMean(n, reps):
#     ybar = np.empty(reps)
#     for j in range(1, reps):
#         sample = stats.norm.rvs(0,1,size=n)
#         ybar[j] = np.mean(sample)
#     return(ybar)

# # call the function multiple times and measure each time:
# n = 100
# r = 10000
# step_length = 100
# reps = range(100, r+1, step_length)
# runTime = np.empty(len(reps))
# number = 100

# for i in range(len(reps)):
#     runTime[i] = timeit.repeat(lambda: simMean(
#         n, reps[i]), number=100, repeat=1)[0] / number
# # repeat gives total time, i.e. number * single iteration time

# dfruntime = pd.DataFrame({"runtime": runTime})
# dfruntime.to_csv("Jlprocessed/01/Adv-Performance-Py.csv")


### R-Code

# library(microbenchmark)
# library(rio)

# set.seed(12345)

# simMean <- function(n, reps){
#   ybar <- numeric(reps)
#   for(j in 1:reps){
#     sample <- rnorm(n)
#     ybar[j] <- mean(sample)
#   }
#   return(ybar)
# }

# # call the function multiple times and measure each time:
# n <- 100
# r <- 10000
# step_length <- 100
# reps <- seq(100, r, by=100)
# runtime <- numeric(R/step_length)


# for (i in 1:length(reps)){
#   t <- microbenchmark( simMean(n,reps[i]), unit = "s")
#   runtime[i] <- mean(t$time) / 1e9 # t$time in nanoseconds
# }

# export(as.data.frame(runtime),file="Jlprocessed/01/Adv-Performance-R.csv")
Adv-Performance-Py.py
import random
import numpy as np
import timeit
import pandas as pd
import scipy.stats as stats

np.random.seed(12345)


def simMean(n, reps):
    ybar = np.empty(reps)
    for j in range(1, reps):
        sample = stats.norm.rvs(0, 1, size=n)
        ybar[j] = np.mean(sample)
    return (ybar)


# call the function multiple times and measure each time:
n = 100
r = 10000
step_length = 100
reps = range(100, r+1, step_length)
runTime = np.empty(len(reps))
number = 100

for i in range(len(reps)):
    runTime[i] = timeit.repeat(lambda: simMean(
        n, reps[i]), number=100, repeat=1)[0] / number
# repeat gives total time, i.e. number * single iteration time

dfruntime = pd.DataFrame({"runtime": runTime})
dfruntime.to_csv("Jlprocessed/01/Adv-Performance-Py.csv")
Adv-Performance-R.R
library(microbenchmark)
library(rio)

set.seed(12345)

simMean <- function(n, reps) {
  ybar <- numeric(reps)
  for (j in 1:reps) {
    sample <- rnorm(n)
    ybar[j] <- mean(sample)
  }
  return(ybar)
}

# call the function multiple times and measure each time:
n <- 100
r <- 10000
step_length <- 100
reps <- seq(100, r, by = 100)
runtime <- numeric(r / step_length)


for (i in 1:length(reps)) {
  t <- microbenchmark(simMean(n, reps[i]), unit = "s")
  runtime[i] <- mean(t$time) / 1e9 # t$time in nanoseconds
}

export(as.data.frame(runtime), file = "Jlprocessed/01/Adv-Performance-R.csv")
Adv-Performance.jl
using Random, Distributions
# set the random seed:
Random.seed!(12345)

function simMean(n, reps)
    ybar = zeros(reps)
    for j in 1:reps
        # sample from normal distribution of size n
        sample = rand(Normal(), n)
        ybar[j] = mean(sample)
    end
    return ybar
end

# call the function and measure time:
n = 100
reps = 10000
stats = @timed simMean(n, reps);
runTime = stats.time
println("runTime = $runTime")
Array-Copy.jl
# define arrays:
testarray = [1, 5, 41.3, 2.0]

# be careful with changes on variables pointing on testarray:
duplicate_array = testarray
duplicate_array[3] = 10000
println("duplicate_array: $duplicate_array\n")
println("testarray: $testarray\n")

# work on a copy of example_list:
testarray = [1, 5, 41.3, 2.0]
duplicate_array = deepcopy(testarray)
duplicate_array[3] = 10000
println("duplicate_array: $duplicate_array\n")
println("testarray: $testarray")
Array-Functions.jl
# define arrays:
vec1 = [1, 4, 64, 36]
mat1 = [4 9 8 3
    2 6 3 2
    1 1 7 4]

# apply some functions:
sort!(vec1)
println("vec1: $vec1\n")

vec2 = sqrt.(vec1)
vec3 = vec1 .+ vec2
println("vec3: $vec3\n")

# get dimensions of mat1:
dim_mat1 = size(mat1)
println("dim_mat1: $dim_mat1")
Array-SpecialCases.jl
# initialize matrix with each element set to zero:
zero_mat = zeros(4, 3)
println("zero_mat: \n$zero_mat\n")

# initialize matrix with each element set to one:
one_mat = ones(2, 5)
println("one_mat: \n$one_mat\n")

# uninitialized matrix (filled with arbitrary nonsense elements):
empty_mat = Array{Float64}(undef, 2, 2)
println("empty_mat: \n$empty_mat")
Arrays.jl
# define arrays:
testarray1D = [1, 5, 41.3, 2.0]
println("type(testarray1D): $(typeof(testarray1D))\n")

testarray2D = [4 9 8 3
    2 6 3 2
    1 1 7 4]

# same as:
#testarray2D = [4 9 8 3; 2 6 3 2; 1 1 7 4]
#testarray2D = [[4 9 8 3]
#   [ 2 6 3 2]
#   [ 1 1 7 4]]
#testarray2D = [[4, 2, 1] [9, 6, 1] [8, 3, 7] [3, 2, 4]]

# get dimensions of testarray2D:
dim = size(testarray2D)
println("dim: $dim\n")

# access elements by indices:
third_elem = testarray1D[3]
println("third_elem = $third_elem\n")

second_third_elem = testarray2D[2, 3] # element in 2nd row and 3rd column
println("second_third_elem = $second_third_elem\n")

second_to_third_col = testarray2D[:, 2:3] # each row in the 2nd and 3rd column
println("second_to_third_col = $second_to_third_col\n")

last_col = testarray2D[:, end] # each row in the last column
println("last_col = $last_col\n")

# access elements by array:
first_third_elem = testarray1D[[1, 3]]
println("first_third_elem: $first_third_elem\n")

# same with Boolean array:
first_third_elem2 = testarray1D[[true, false, true, false]]
println("first_third_elem2 = $first_third_elem2\n")

k = [[true false false false]
    [false false true false]
    [true false true false]]
elem_by_index = testarray2D[k] # 1st elem in 1st row, 1st elem in 3rd row...
print("elem_by_index: $elem_by_index")
CDF-example.jl
using Distributions

# binomial CDF:
p1 = cdf(Binomial(10, 0.2), 3)
println("p1 = $p1\n")

# normal CDF:
p2 = cdf(Normal(), 1.96) - cdf(Normal(), -1.96)
println("p2 = $p2")
CDF-figure.jl
using Distributions, Plots

# binomial CDF:
x_binom = range(-1, 10, length=100)
cdf_binom = cdf.(Binomial(10, 0.2), x_binom)

plot(x_binom, cdf_binom, linetype=:steppre, color=:black, legend=false)
xlabel!("x")
ylabel!("Fx")
savefig("JlGraphs/CDF-figure-discrete.pdf")

# normal CDF:
x_norm = range(-4, 4, length=1000)
cdf_norm = cdf.(Normal(), x_norm)

plot(x_norm, cdf_norm, color=:black, legend=false)
xlabel!("x")
ylabel!("Fx")
savefig("JlGraphs/CDF-figure-cont.pdf")
Critical-Values-t.jl
using Distributions, DataFrames

# degrees of freedom = n-1:
df = 19

# significance levels:
alpha_one_tailed = [0.1, 0.05, 0.025, 0.01, 0.005, 0.001]
alpha_two_tailed = alpha_one_tailed * 2

# critical values & table:
CV = quantile.(TDist(df), 1 .- alpha_one_tailed)
table = DataFrame(alpha_one_tailed=alpha_one_tailed,
    alpha_two_tailed=alpha_two_tailed,
    CV=CV)
println("table: \n$table")
DataFrames-Functions.jl
using DataFrames, CategoricalArrays, Statistics

# define a DataFrame:
icecream_sales = [30, 40, 35, 130, 120, 60]
weather_coded = [0, 1, 0, 1, 1, 0]
customers = [2000, 2100, 1500, 8000, 7200, 2000]
df = DataFrame(
    icecream_sales=icecream_sales,
    weather_coded=weather_coded,
    customers=customers
)

# get some descriptive statistics:
descr_stats = describe(df)
println("descr_stats: \n$descr_stats\n")

# add one observation at the end in-place:
push!(df, [50, 1, 3000])
println("df: \n$df\n")

# extract observations with more than 2500 customers:
subset_df = subset(df, :customers => ByRow(>(2500)))
println("subset_df: \n$subset_df\n")

# use a CategoricalArray object to attach labels (0 = bad; 1 = good):
df.weather = recode(df[!, :weather_coded], 0 => "bad", 1 => "good")
println("df \n$df\n")

# mean sales for each weather category by
# grouping and splitting data:
grouped_data = groupby(df, :weather)
# apply the mean to icecream_sales and combine the results:
group_means = combine(grouped_data, :icecream_sales => mean)
println("group_means: \n$group_means")
DataFrames.jl
using DataFrames

# define a DataFrame:
icecream_sales = [30, 40, 35, 130, 120, 60]
weather_coded = [0, 1, 0, 1, 1, 0]
customers = [2000, 2100, 1500, 8000, 7200, 2000]
df = DataFrame(
    icecream_sales=icecream_sales,
    weather_coded=weather_coded,
    customers=customers
)

# print the DataFrame
println("df: \n$df\n")

# access columns by variable reference:
subset1 = df[!, [:icecream_sales, :customers]]
println("subset1: \n$subset1\n")

# access second to fourth row:
subset2 = df[2:4, :]
println("subset2: \n$subset2\n")

# access rows and columns by variable integer positions:
subset3 = df[2:4, 1:2]
println("subset3: \n$subset3\n")

# access rows by variable integer positions:
subset4 = df[2:4, [:icecream_sales, :weather_coded]]
println("subset4: \n$subset4")
Descr-Boxplot.jl
using WooldridgeDatasets, DataFrames, StatsPlots

ceosal1 = DataFrame(wooldridge("ceosal1"))
# extract roe and salary:
roe = ceosal1.roe
consprod = ceosal1.consprod

# plotting descriptive statistics:
boxplot(roe, orientation=:h,
    linecolor=:black, color=:white, legend=false)
yticks!([1], [""])
ylabel!("roe")
savefig("JlGraphs/Boxplot1.pdf")

# plotting descriptive statistics (logical indexing):
roe_cp0 = roe[consprod.==0]
roe_cp1 = roe[consprod.==1]
boxplot([roe_cp0, roe_cp1], linecolor=:black,
    color=:white, legend=false)
xticks!([1, 2], ["consprod=0", "consprod=1"])
ylabel!("roe")
savefig("JlGraphs/Boxplot2.pdf")
Descr-ECDF.jl
using WooldridgeDatasets, DataFrames, Plots

ceosal1 = DataFrame(wooldridge("ceosal1"))

# extract roe:
roe = ceosal1.roe

# calculate ECDF:
x = sort(roe)
n = length(x)
y = range(start=1, stop=n) / n

# plot a step function:
plot(x, y, linetype=:steppre, color=:black, legend=false)
xlabel!("roe")
savefig("JlGraphs/ecdf.pdf")
Descr-Figures.jl
using WooldridgeDatasets, DataFrames, Plots, StatsPlots,
    FreqTables, CategoricalArrays

affairs = DataFrame(wooldridge("affairs"))

# attach labels to kids and convert it to a categorical variable:
affairs.haskids = categorical(
    recode(affairs.kids, 0 => "no", 1 => "yes")
)

# ... and ratemarr (for example: 1 = "very unhappy", 2 = "unhappy",...):
affairs.marriage = categorical(
    recode(affairs.ratemarr,
        1 => "very unhappy",
        2 => "unhappy",
        3 => "average",
        4 => "happy",
        5 => "very happy"
    )
)

# counts for all graphs:
counts_m = sort(freqtable(affairs.marriage), rev=true)
levels_counts_m = String.(collect(keys(counts_m.dicts[1])))
colors_m = [:grey60, :grey50, :grey40, :grey30, :grey20]

ct_all_abs = freqtable(affairs.marriage, affairs.haskids)
levels_counts_all = String.(collect(keys(ct_all_abs.dicts[1])))
colors_all = [:grey80 :grey50]

# pie chart (a):
pie(levels_counts_m, counts_m, color=colors_m)
savefig("JlGraphs/Descr-Pie.pdf")

# bar chart (b):
bar(levels_counts_m, counts_m, color=:grey, legend=false)
savefig("JlGraphs/Descr-Bar1.pdf")

# stacked bar plot (c):
groupedbar(ct_all_abs, bar_position=:stack,
    color=colors_all, label=["no" "yes"])
xticks!(1:5, levels_counts_all)
savefig("JlGraphs/Descr-Bar2.pdf")

# grouped bar plot (d):
groupedbar(ct_all_abs, bar_position=:dodge,
    color=colors_all, label=["no" "yes"])
xticks!(1:5, levels_counts_all)
savefig("JlGraphs/Descr-Bar3.pdf")
Descr-Stats.jl
using WooldridgeDatasets, DataFrames, Statistics

ceosal1 = DataFrame(wooldridge("ceosal1"))

# extract roe and salary:
roe = ceosal1.roe
salary = ceosal1.salary

# sample average:
roe_mean = mean(roe)
println("roe_mean = $roe_mean\n")

# sample median:
roe_med = median(roe)
println("roe_med = $roe_med\n")

# corrected standard deviation (n-1 scaling):
roe_std = std(roe)
println("roe_st = $roe_std\n")

# correlation with roe:
roe_corr = cor(roe, salary)
println("roe_corr = $roe_corr\n")

# correlation matrix with roe:
roe_corr_mat = cor(hcat(roe, salary))
println("roe_corr_mat: \n$roe_corr_mat")
Descr-Tables.jl
using WooldridgeDatasets, DataFrames, CategoricalArrays, FreqTables

affairs = DataFrame(wooldridge("affairs"))

# attach labels to kids and convert it to a categorical variable:
affairs.haskids = categorical(
    recode(affairs.kids, 0 => "no", 1 => "yes")
)

# ... and ratemarr (for example: 1 = "very unhappy", 2 = "unhappy",...):
affairs.marriage = categorical(
    recode(affairs.ratemarr,
        1 => "very unhappy",
        2 => "unhappy",
        3 => "average",
        4 => "happy",
        5 => "very happy"
    )
)

# frequency table (alphabetical order of elements):
ft_marriage = freqtable(affairs.marriage)
println("ft_marriage: \n$ft_marriage\n")

# frequency table with groupby:
ft_groupby = combine(
    groupby(affairs, :haskids),
    nrow)
println("ft_groupby: \n$ft_groupby\n")

# contingency tables with absolute and relative values:
ct_all_abs = freqtable(affairs.marriage, affairs.haskids)
println("ct_all_abs: \n$ct_all_abs\n")

ct_all_rel = proptable(affairs.marriage, affairs.haskids)
println("ct_all_rel: \n$ct_all_rel\n")

# share within "marriage" (i.e. within a row):
ct_row = proptable(affairs.marriage, affairs.haskids, margins=1)
println("ct_row: \n$ct_row\n")

# share within "haskids" (i.e. within a column):
ct_col = proptable(affairs.marriage, affairs.haskids, margins=2)
println("ct_col: \n$ct_col")
Dicts.jl
# define and print a dict:
var1 = ["Florian", "Daniel"]
var2 = [96, 49]
example_dict = Dict("name" => var1, "points" => var2)
println("example_dict: \n$example_dict\n")

# get data type:
type_example_dict = typeof(example_dict)
println("type_example_dict: $type_example_dict\n")

# access "points":
points_all = example_dict["points"]
println("points_all: $points_all\n")

# access "points" of Daniel:
points_daniel = example_dict["points"][2]
println("points_daniel: $points_daniel\n")

# add 4 to "points" of Daniel:
example_dict["points"][2] = example_dict["points"][2] + 4
println("example_dict: \n$example_dict\n")

# add a new component "grade":
example_dict["grade"] = [1.3, 4.0]

# delete component "points":
delete!(example_dict, "points")
print("example_dict: \n$example_dict\n")
Example-B-6.jl
using Distributions

# first example using the transformation:
p1_1 = cdf(Normal(), 2 / 3) - cdf(Normal(), -2 / 3)
println("p1_1 = $p1_1\n")

# first example working directly with the distribution of X:
p1_2 = cdf(Normal(4, 3), 6) - cdf(Normal(4, 3), 2)
println("p1_2 = $p1_2\n")

# second example:
p2 = 1 - cdf(Normal(4, 3), 2) + cdf(Normal(4, 3), -2)
println("p2 = $p2")
Example-C-2.jl
using Distributions

# manually enter raw data from Wooldridge, Table C.3:
SR87 = [10, 1, 6, 0.45, 1.25, 1.3, 1.06, 3, 8.18, 1.67,
    0.98, 1, 0.45, 5.03, 8, 9, 18, 0.28, 7, 3.97]
SR88 = [3, 1, 5, 0.5, 1.54, 1.5, 0.8, 2, 0.67, 1.17, 0.51,
    0.5, 0.61, 6.7, 4, 7, 19, 0.2, 5, 3.83]

# calculate change:
Change = SR88 .- SR87

# ingredients to CI formula:
avgCh = mean(Change)
println("avgCh = $avgCh\n")

n = length(Change)
sdCh = std(Change)
se = sdCh / sqrt(n)
println("se = $se\n")

c = quantile(TDist(n - 1), 0.975)
println("c = $c\n")

# confidence interval:
lowerCI = avgCh - c * se
println("lowerCI = $lowerCI\n")
upperCI = avgCh + c * se
println("upperCI = $upperCI")
Example-C-3.jl
using WooldridgeDatasets, DataFrames, Distributions

audit = DataFrame(wooldridge("audit"))
y = audit.y

# ingredients to CI formula:
avgy = mean(y)
n = length(y)
sdy = std(y)
se = sdy / sqrt(n)
c95 = quantile(Normal(), 0.975)
c99 = quantile(Normal(), 0.995)

# 95% confidence interval:
lowerCI95 = avgy - c95 * se
println("lowerCI95 = $lowerCI95\n")

upperCI95 = avgy + c95 * se
println("upperCI95 = $upperCI95\n")

# 99% confidence interval:
lowerCI99 = avgy - c99 * se
println("lowerCI99 = $lowerCI99\n")

upperCI99 = avgy + c99 * se
println("upperCI99 = $upperCI99")
Example-C-5.jl
using WooldridgeDatasets, DataFrames, Distributions, HypothesisTests

audit = DataFrame(wooldridge("audit"))
y = audit.y

# automated calculation of t statistic for H0 (mu=0):
test_auto = OneSampleTTest(y, 0)
t_auto = test_auto.t # access test statistic
p_auto = pvalue(test_auto, tail=:left)  # access one-sided p value
println("t_auto = $t_auto\n")
println("p_auto = $p_auto\n")

# manual calculation of t statistic for H0 (mu=0):
avgy = mean(y)
n = length(y)
sdy = std(y)
se = sdy / sqrt(n)
t_manual = avgy / se
println("t_manual = $t_manual\n")

# critical values for t distribution with n-1=240 d.f.:
alpha_one_tailed = [0.1, 0.05, 0.025, 0.01, 0.005, 0.001]
CV = quantile(TDist(n - 1), 1 .- alpha_one_tailed)
table = DataFrame(alpha_one_tailed=alpha_one_tailed, CV=CV)
println("table: \n$table")
Example-C-6.jl
using Distributions, HypothesisTests

# manually enter raw data from Wooldridge, Table C.3:
SR87 = [10, 1, 6, 0.45, 1.25, 1.3, 1.06, 3, 8.18, 1.67,
        0.98, 1, 0.45, 5.03, 8, 9, 18, 0.28, 7, 3.97]
SR88 = [3, 1, 5, 0.5, 1.54, 1.5, 0.8, 2, 0.67, 1.17, 0.51,
        0.5, 0.61, 6.7, 4, 7, 19, 0.2, 5, 3.83]
Change = SR88 .- SR87

# automated calculation of t statistic for H0 (mu=0):
test_auto = OneSampleTTest(Change, 0)
t_auto = test_auto.t
p_auto = pvalue(test_auto, tail=:left)
println("t_auto = $t_auto\n")
println("p_auto = $p_auto\n")

# manual calculation of t statistic for H0 (mu=0):
avgCh = mean(Change)
n = length(Change)
sdCh = std(Change)
se = sdCh / sqrt(n)
t_manual = avgCh / se
println("t_manual = $t_manual\n")

# manual calculation of p value for H0 (mu=0):
p_manual = cdf(TDist(n - 1), t_manual)
println("p_manual = $p_manual")
Example-C-7.jl
using WooldridgeDatasets, DataFrames, Distributions, HypothesisTests

audit = DataFrame(wooldridge("audit"))
y = audit.y

# automated calculation of t statistic for H0 (mu=0):
test_auto = OneSampleTTest(y, 0)
t_auto = test_auto.t
p_auto = pvalue(test_auto, tail=:left)
println("t_auto = $t_auto\n")
println("p_auto = $p_auto\n")

# manual calculation of t statistic for H0 (mu=0):
avgy = mean(y)
n = length(y)
sdy = std(y)
se = sdy / sqrt(n)
t_manual = avgy / se
println("t_manual = $t_manual\n")

# manual calculation of p value for H0 (mu=0):
p_manual = cdf(TDist(n - 1), t_manual)
println("p_manual = $p_manual")
First-Julia-Script.jl
# This is a comment.
# in the next line, we try to enter Shakespeare:
"To be, or not to be: that is the question"
# let’s try some sensible math:
sqrt(16)
print((1 + 2) * 5)
Graphs-Basics.jl
using Plots

# create data:
x = [1, 3, 4, 7, 8, 9]
y = [0, 3, 6, 9, 7, 8]

# plot and save:
plot(x, y, color=:black)
savefig("JlGraphs/Graphs-Basics-a.pdf")

# scatter and save:
scatter(x, y, color=:black, markershape=:dtriangle, legend=false)
savefig("JlGraphs/Graphs-Basics-b.pdf")
Graphs-Basics2.jl
using Plots

# create data:
x = [1, 3, 4, 7, 8, 9]
y = [0, 3, 6, 9, 7, 8]

# plot and save:
plot(x, y, color=:black, linestyle=:dash, legend=false)
savefig("JlGraphs/Graphs-Basics-c.pdf")

plot(x, y, color=:black, linestyle=:dot, legend=false)
savefig("JlGraphs/Graphs-Basics-d.pdf")

plot(x, y, color=:black, linestyle=:solid, linewidth=3, legend=false)
savefig("JlGraphs/Graphs-Basics-e.pdf")

plot(x, y, color=:black, markershape=:circle, legend=false)
savefig("JlGraphs/Graphs-Basics-f.pdf")
Graphs-BuildingBlocks.jl
using Plots, Distributions

# support for all normal densities:
x = range(-4, 4, length=100)
# get different density evaluations:
y1 = pdf.(Normal(), x)
y2 = pdf.(Normal(1, 0.5), x)
y3 = pdf.(Normal(0, 2), x)

# plot:
plot(x, y1, linestyle=:solid, color=:black, label="standard normal")
plot!(x, y2, linestyle=:dash, color=:black,
    linealpha=0.6, label="mu = 1, sigma = 0.5")
plot!(x, y3, linestyle=:dot, color=:black,
    linealpha=0.3, label="mu = 0, sigma = 2")
xlims!(-3, 4)
title!("Normal Densities")
ylabel!("phi(x)")
xlabel!("x")
savefig("JlGraphs/Graphs-BuildingBlocks.pdf")
Graphs-Export.jl
using Plots, Distributions

# support for all normal densities:
x = range(-4, 4, length=100)
# get different density evaluations:
y1 = pdf.(Normal(), x)
y2 = pdf.(Normal(0, 3), x)

# plot (a):
plot(legend=false, size=(400, 600))
plot!(x, y1, linestyle=:solid, color=:black)
plot!(x, y2, linestyle=:dash, color=:black, linealpha=0.3)
savefig("JlGraphs/Graphs-Export-a.pdf")

# plot (b):
plot(legend=false, size=(600, 400))
plot!(x, y1, linestyle=:solid, color=:black)
plot!(x, y2, linestyle=:dash, color=:black, linealpha=0.3)
savefig("JlGraphs/Graphs-Export-b.png")
Graphs-Functions.jl
using Plots, Distributions

# support of quadratic function
# (creates an array with 100 equispaced elements from -3 to 2):
x1 = range(start=-3, stop=2, length=100)
# function values for all these values:
y1 = x1 .^ 2

# plot quadratic function:
plot(x1, y1, linestyle=:solid, color=:black, legend=false)
savefig("JlGraphs/Graphs-Functions-a.pdf")

# same for normal density:
x2 = range(-4, 4, length=100)
y2 = pdf.(Normal(), x2)

# plot normal density:
plot(x2, y2, linestyle=:solid, color=:black, legend=false)
savefig("JlGraphs/Graphs-Functions-b.pdf")
Histogram.jl
using WooldridgeDatasets, Plots, DataFrames

ceosal1 = DataFrame(wooldridge("ceosal1"))

# extract roe:
roe = ceosal1.roe

# histogram with counts (a):
histogram(roe, color=:grey, legend=false)
ylabel!("Counts")
xlabel!("roe")
savefig("JlGraphs/Histogram1.pdf")

# histogram with density and explicit breaks (b):
breaks = [0, 5, 10, 20, 30, 60]
histogram(roe, color=:grey,
    bins=breaks,
    normalize=true,
    legend=false)
xlabel!("roe")
ylabel!("Density")
savefig("JlGraphs/Histogram2.pdf")
Import-Export.jl
using DataFrames, CSV

# import a .CSV file with CSV.read:
df1 = CSV.read("data/sales.csv", DataFrame, delim=",",
    header=["year", "product1", "product2", "product3"])
println("df1: \n$df1\n")

# import a .txt file with CSV.read:
df2 = CSV.read("data/sales.txt", DataFrame, delim=" ")
println("df2: \n$df2\n")

# add a row to df1:
push!(df1, [2014, 10, 8, 2])
println("df1: \n$df1")

# export with CSV.write:
CSV.write("data/sales2.csv", df1)
Import-StockData.jl
using DataFrames, Dates, MarketData

# download data for "F" (= Ford) and define start and end:
ticker = "F"
start_date = DateTime(2007, 12, 31)
end_date = DateTime(2017, 01, 01)

# import data as DataFrame:
F_data = DataFrame(yahoo(ticker,
    YahooOpt(period1=start_date, period2=end_date)))

preview_F_data = first(F_data, 5)
println("preview_F_data: \n$preview_F_data")
Julia-as-a-Calculator.jl
result1 = 1 + 1
print("result1 = $result1\n")

result2 = 5 * (4 - 1)^2
println("result2 = $result2")

result3 = [result1, result2]
print("result3:\n $result3")
KDensity.jl
using WooldridgeDatasets, DataFrames, Plots, KernelDensity

ceosal1 = DataFrame(wooldridge("ceosal1"))

# extract roe:
roe = ceosal1.roe

# estimate kernel density:
kde_est = KernelDensity.kde(roe)

# kernel density (a):
plot(kde_est.x, kde_est.density, color=:black, linewidth=2, legend=false)
ylabel!("density")
xlabel!("roe")
savefig("JlGraphs/Density1.pdf")

# kernel density with overlayed histogram (b):
histogram(roe, color="grey", normalize=true, legend=false)
plot!(kde_est.x, kde_est.density, color=:black, linewidth=2)
ylabel!("density")
xlabel!("roe")
savefig("JlGraphs/Density2.pdf")
Lists-Copy.jl
# define a list:
example_list = [1, 5, 41.3, 2.0]

# be careful with changes on variables pointing on example_list:
duplicate_list = example_list
duplicate_list[3] = 10000
println("duplicate_list:$duplicate_list\n")
println("example_list:$example_list\n")

# work on a copy of example_list:
example_list = [1, 5, 41.3, 2.0]
duplicate_list = example_list[:]
# alternative: duplicate_list = deepcopy(example_list)
duplicate_list[3] = 10000
println("duplicate_list:$duplicate_list\n")
println("example_list:$example_list\n")
Lists.jl
# define a vector: 
example_list = [1, 5, 41.3, 2.0] # note: integers are casted to floats automatically
type_example_list = typeof(example_list)
println("type(example_list): $type_example_list\n")

# access first entry by index:
first_entry = example_list[1]
println("first_entry: $first_entry\n")

# access second to fourth entry by index:
range2to4 = example_list[2:4]
println("range2to4: $range2to4\n")

# replace third entry by new value:
example_list[3] = 3
println("example_list: $example_list\n")

# apply a function:
function_output = minimum(example_list)
println("function_output: $function_output\n")
# apply another function:
example_list = sort(example_list)
println("example_list: $example_list\n")
# delete third element of sorted list:
deleteat!(example_list, 3)
println("example_list: $example_list\n")
Matrix-Operations.jl
# define matrices:
mat1 = [4 9 8
        2 6 3]
mat2 = [1 5 2
        6 6 0
        4 8 3]

# use exp() and apply it to each element:
result1 = exp.(mat1)
result1_rounded = round.(result1, digits=4)
println("result1_rounded: \n$result1_rounded\n")

result2 = mat1 .+ mat2[1:2, :]
println("result2: $result2\n")

# use another function:
mat1_tr = transpose(mat1) #or simply: mat1'
println("mat1_tr: $mat1_tr\n")

# matrix algebra:
matprod = mat1 * mat2
println("matprod: $matprod")
MCSim-lln-clt-n.jl
using Plots, Distributions, Random, KernelDensity

##################
## LLN (normal) ##
##################

# set the random seed:
Random.seed!(12345)

# set sample sizes and MC simulations:
n = [10, 50, 100, 1000]
r = 10000

# support of normal density (fixed):
x_range = range(8.5, 11.5, length=100)

for i in n
    ybar = zeros(r)
    for j in 1:r
        # sample of size n
        sample = rand(Normal(10, 2), i)
        ybar[j] = mean(sample)
    end
    # simulated density:
    kde = KernelDensity.kde(ybar)

    # normal density:
    y = pdf.(Normal(10, 2 / sqrt(i)), x_range)

    # plotting:
    filename = "JlGraphs/MCSim-lln-n" * string(i) * ".pdf"

    plot(kde.x, kde.density, color=:black, label="ybar")
    plot!(x_range, y, linestyle=:dash, color=:black, label="normal distribution")
    xlims!((8.5, 11.5))
    ylims!((0, 6.5))
    ylabel!("density")
    xlabel!("ybar")
    savefig(filename)
end

##################
## CLT (chisq) ##
##################

# density:
x_range = range(0, 3, length=100)

y = pdf.(Chisq(1), x_range)
plot(x_range, y, linestyle=:solid, color=:black, legend=false)
ylabel!("density")
xlabel!("x")
savefig("JlGraphs/MCSim-chisqdens.pdf")

# set the random seed:
Random.seed!(123456)

# set sample sizes and MC simulations:
n = [2, 10, 100, 10000]
r = 10000

for i in n
    ybar = zeros(r)
    for j in 1:r
        # sample of size n
        sample = rand(Chisq(1), i)
        ybar[j] = mean(sample)
    end

    # simulated density:
    kde = KernelDensity.kde(ybar)

    # normal density:
    x_range = range(minimum(ybar), maximum(ybar), length=50)
    y = pdf.(Normal(1, sqrt(2 / i)), x_range)

    # plotting:
    filename = "JlGraphs/MCSim-clt-n" * string(i) * ".pdf"
    plot(kde.x, kde.density, color=:black, label="ybar")
    plot!(x_range, y, linestyle=:dash, color=:black, label="normal distribution")
    ylabel!("density")
    xlabel!("ybar")
    savefig(filename)
end
Objects-in-Julia.jl
result1 = 1 + 1
# determine the type:
type_result1 = typeof(result1)
# print the result:
println("type_result1: $type_result1\n")

result2 = 2.5
type_result2 = typeof(result2)
println("type_result2: $type_result2\n")

result3 = "To be, or not to be: that is the question"
type_result3 = typeof(result3)
println("type_result3: $type_result3")
Package-Statistics.jl
using Statistics

a = [2, 6, 4, 9, 1]

a_mean = mean(a)
println("a_mean = $a_mean")

a_var = Statistics.var(a)
println("a_var = $a_var")
PDF-example.jl
using Plots, Distributions

# support of normal density:
x_range = range(-4, 4, length=100)

# PDF for all these values:
pdf_normal = pdf.(Normal(), x_range)

# plot:
plot(x_range, pdf_normal, color=:black, legend=false)
xlabel!("x")
ylabel!("dx")
savefig("JlGraphs/PDF-example.pdf")
PMF-binom.jl
using Distributions

# pedestrian approach:
p1 = binomial(10, 2) * (0.2^2) * (0.8^8)
println("p1 = $p1\n")

# package function:
p2 = pdf(Binomial(10, 0.2), 2)
println("p2 = $p2")
PMF-example.jl
using Distributions, DataFrames, Plots

# PMF for all values between 0 and 10:
x = 0:10
fx = pdf.(Binomial(10, 0.2), x)

# collect values in DataFrame:
result = DataFrame(x=x, fx=fx)
println("result: \n$result")

# plot:
bar(x, fx, color=:grey, alpha=0.6, legend=false)
xlabel!("x")
ylabel!("fx")
savefig("JlGraphs/PMF-example.pdf")
PyCall-Alternative.jl
using PyCall

# using pyimport to work with modules:
np = pyimport("numpy")

# define matrices in Julia:
mat1 = [4 9 8
        2 6 3]
mat2 = [1 5 2
        6 6 0
        4 8 3]

# ... and pass them to numpys dot function:
matprod = np.dot(mat1, mat2)
println("matprod: $matprod\n")

matprod_type = typeof(matprod)
println("matprod_type: $matprod_type")
PyCall-Simple.jl
using PyCall

# define a block of Python Code:
py"""
import numpy as np

# define arrays in numpy:
mat1 = np.array([[4, 9, 8],
                 [2, 6, 3]])
mat2 = np.array([[1, 5, 2],
                 [6, 6, 0],
                 [4, 8, 3]])
# matrix algebra:
matprod_py = mat1 @ mat2
"""

# automatic type conversion from Python to Julia:
matprod = py"matprod_py"
matprod_type = typeof(matprod)
println("matprod_type: $matprod_type\n")
println("matprod: $matprod")
Quantile-example.jl
using Distributions

q_975 = quantile(Normal(), 0.975)
println("q_975 = $q_975")
Random-Numbers.jl
using Distributions, Random

Random.seed!(12345)
# sample from a standard normal RV with sample size n=3:
sample1 = rand(Normal(), 3)
println("sample1: $sample1\n")

# a different sample from the same distribution:
sample2 = rand(Normal(), 3)
println("sample2: $sample2\n")

# set the seed of the random number generator and take two samples:
Random.seed!(54321)
sample3 = rand(Normal(), 3)
println("sample3: $sample3\n")

sample4 = rand(Normal(), 3)
println("sample4: $sample4\n")

# reset the seed to the same value to get the same samples again:
Random.seed!(54321)
sample5 = rand(Normal(), 3)
println("sample5: $sample5\n")

sample6 = rand(Normal(), 3)
println("sample6: $sample6")
Sample-Bernoulli.jl
using Distributions

sample = rand(Bernoulli(0.5), 10)
println("sample: $sample")
Sample-Norm.jl
using Distributions

sample = rand(Normal(), 6)
sample_rounded = round.(sample, digits=5)
println("sample_rounded: $sample_rounded")
Simulate-Estimate.jl
using Distributions, Random

# set the random seed:
Random.seed!(12345)

# set sample size:
n = 100

# draw a sample given the population parameters:
sample1 = rand(Normal(10, 2), n)

# estimate the population mean with the sample average:
estimate1 = mean(sample1)
println("estimate1 = $estimate1\n")

# draw a different sample and estimate again:
sample2 = rand(Normal(10, 2), n)
estimate2 = mean(sample2)
println("estimate2 = $estimate2\n")

# draw a third sample and estimate again:
sample3 = rand(Normal(10, 2), n)
estimate3 = mean(sample3)
print("estimate3: $estimate3")
Simulation-Inference-Figure.jl
using Distributions, HypothesisTests, Plots, Random

# set the random seed:
Random.seed!(12345)

# set sample size and MC simulations:
r = 10000
n = 100

# initialize arrays to later store results:
CIlower = zeros(r)
CIupper = zeros(r)
pvalue1 = zeros(r)
pvalue2 = zeros(r)


# repeat r times:
for j in 1:r
    # draw a sample
    sample = rand(Normal(10, 2), n)
    sample_mean = mean(sample)
    sample_sd = std(sample)

    # test the (correct) null hypothesis mu=10:
    testres1 = OneSampleTTest(sample, 10)
    pvalue1[j] = pvalue(testres1)
    cv = quantile(TDist(n - 1), 0.975)
    CIlower[j] = sample_mean - cv * sample_sd / sqrt(n)
    CIupper[j] = sample_mean + cv * sample_sd / sqrt(n)

    # test the (incorrect) null hypothesis mu=9.5 & store the p value:
    testres2 = OneSampleTTest(sample, 9.5)
    pvalue2[j] = pvalue(testres2)

end

##################
## correct H0 ##
##################
plot(legend=false, size=(300, 500)) # initialize plot and set figure ratio
ylims!((0, 101))
xlims!((9, 11))
vline!([10], linestyle=:dash, color=:black, linewidth=0.5)
ylabel!("Sample No.")

for j in 1:100
    if 10 > CIlower[j] && 10 < CIupper[j]
        plot!([CIlower[j], CIupper[j]], [j, j], linestyle=:solid, color=:grey)
    else
        plot!([CIlower[j], CIupper[j]], [j, j], linestyle=:solid, color=:black)
    end
end
savefig("JlGraphs/Simulation-Inference-Figure1.pdf")

##################
## incorrect H0 ##
##################
plot(legend=false, size=(300, 500)) # initialize plot and set figure ratio
ylims!((0, 101))
xlims!((9, 11))
vline!([9.5], linestyle=:dash, color=:black, linewidth=0.5)
ylabel!("Sample No.")

for j in 1:100
    if 9.5 > CIlower[j] && 9.5 < CIupper[j]
        plot!([CIlower[j], CIupper[j]], [j, j], linestyle=:solid, color=:grey)
    else
        plot!([CIlower[j], CIupper[j]], [j, j], linestyle=:solid, color=:black)
    end
end
savefig("JlGraphs/Simulation-Inference-Figure2.pdf")
Simulation-Inference.jl
using Distributions, Random, HypothesisTests

# set the random seed:
Random.seed!(12345)

# set sample size and MC simulations:
r = 10000
n = 100

# initialize arrays to later store results:
CIlower = zeros(r)
CIupper = zeros(r)
pvalue1 = zeros(r)
pvalue2 = zeros(r)


# repeat r times:
for j in 1:r
    # draw a sample
    sample = rand(Normal(10, 2), n)
    sample_mean = mean(sample)
    sample_sd = std(sample)

    # test the (correct) null hypothesis mu=10:
    testres1 = OneSampleTTest(sample, 10)
    pvalue1[j] = pvalue(testres1)
    cv = quantile(TDist(n - 1), 0.975)
    CIlower[j] = sample_mean - cv * sample_sd / sqrt(n)
    CIupper[j] = sample_mean + cv * sample_sd / sqrt(n)

    # test the (incorrect) null hypothesis mu=9.5 & store the p value:
    testres2 = OneSampleTTest(sample, 9.5)
    pvalue2[j] = pvalue(testres2)

end

# test results as logical value:
reject1 = pvalue1 .<= 0.05
count1_true = count(reject1) # counts true
count1_false = r - count1_true
println("count1_true: $count1_true\n")
println("count1_false: $count1_false\n")

reject2 = pvalue2 .<= 0.05
count2_true = count(reject2)
count2_false = r - count2_true
println("count2_true: $count2_true\n")
println("count2_false: $count2_false")
Simulation-Repeated-Results.jl
using Distributions, Random, KernelDensity, Plots

# set the random seed:
Random.seed!(12345)

# set sample size:
n = 100

# initialize ybar to an array of length r=10000 to later store results:
r = 10000
ybar = zeros(r)

# repeat r times:
for j in 1:r
    # draw a sample and store the sample mean in pos. j=1,... of ybar:
    sample = rand(Normal(10, 2), n)
    ybar[j] = mean(sample)
end

# the first 8 of 10000 estimates:
ybar_preview = round.(ybar[1:8], digits=4)
println("ybar_preview: \n$ybar_preview\n")

# simulated mean:
mean_ybar = mean(ybar)
println("mean_ybar = $mean_ybar\n")

# simulated variance:
var_ybar = var(ybar)
println("var_ybar = $var_ybar")

# simulated density:
kde = KernelDensity.kde(ybar)

# normal density:
x_range = range(9, 11, length=100)
y = pdf.(Normal(10, sqrt(0.04)), x_range)

# create graph:
plot(kde.x, kde.density, color=:black, label="ybar", linewidth=2)
plot!(x_range, y, linestyle=:dash, color=:black,
    label="normal distribution", linewidth=2)
ylabel!("density")
xlabel!("ybar")
savefig("JlGraphs/Simulation-Repeated-Results.pdf")
Simulation-Repeated.jl
using Distributions, Random

# set the random seed:
Random.seed!(12345)

# set sample size:
n = 100

# initialize ybar to an array of length r=10000 to later store results:
r = 10000
ybar = zeros(r)

# repeat r times:
for j in 1:r
    # draw a sample and store the sample mean in pos. j=1,... of ybar:
    sample = rand(Normal(10, 2), n)
    ybar[j] = mean(sample)
end
Wooldridge.jl
using WooldridgeDatasets, DataFrames

# load data:
wage1 = DataFrame(wooldridge("wage1"))

# get type:
type_wage1 = typeof(wage1)
println("type_wage1: $type_wage1\n")

# get first four observations and first eight variables:
preview_wage1 = wage1[1:4, 1:8]
println("preview_wage1: \n$preview_wage1")