# Basic arithmetic ----
# An expression typed at top level is evaluated and its value is printed.
5 + 7
x <- 5 + 7 # x equals five plus seven
x
y <- x - 3
y

# c() combines its arguments into a vector.
z <- c(1, 2.2, 3)
z
# For help, ?function_name opens the documentation.
?c
# Combining vectors
c(z, 555, z)

# Operations on vectors ----
z * 2 + 100
# Other common arithmetic operators are `+`, `-`, `/`, and `^`
# (where x^2 means 'x squared'). To take the square root, use the sqrt()
# function and to take the absolute value, use the abs() function.
my_sqrt <- sqrt(z - 1)
my_sqrt
my_div <- z / my_sqrt
my_div
# Arithmetic operators (`+`, `-`, `*`, etc.) are applied to vectors
# element-by-element. If the vectors are of different lengths, R 'recycles'
# the shorter vector until it is the same length as the longer vector.
# Example
c(1, 2, 3, 4) + c(0, 10)
# In case the longer vector's length is not a multiple of the shorter one's
# (R still recycles, but emits a warning)
c(1, 2, 3, 4) + c(0, 10, 100)
# Working directory and file system helpers ----
# Determine which directory the R session is using as its current working
# directory using getwd(). The value is kept in start_dir so the script can
# return here later without relying on a machine-specific absolute path.
start_dir <- getwd()
start_dir
# List all the objects in your local workspace using ls().
ls()
# List all the files in your working directory using list.files() or dir().
list.files()
dir()
# Using the args() function on a function name is also a handy way to see
# what arguments a function can take.
args(list.files)
# Use dir.create() to create a directory in the current working directory
# called "testdir".
dir.create("testdir")
# Set your working directory to "testdir" with the setwd() command.
setwd("testdir")
# Create a file in your working directory called "mytest.R" using the
# file.create() function.
file.create("mytest.R")
list.files()
# Check to see if "mytest.R" exists in the working directory using the
# file.exists() function.
file.exists("mytest.R")
# Access information about the file "mytest.R" by using file.info().
file.info("mytest.R")
# You can use the $ operator --- e.g., file.info("mytest.R")$mode --- to
# grab specific items.
# Change the name of the file "mytest.R" to "mytest2.R" by using
# file.rename().
file.rename("mytest.R", "mytest2.R")
# Make a copy of "mytest2.R" called "mytest3.R" using file.copy().
file.copy("mytest2.R", "mytest3.R")
# Provide the relative path to the file "mytest3.R" by using file.path().
file.path("mytest3.R")
# You can use file.path() to construct file and directory paths that are
# independent of the operating system your R code is running on. Pass
# 'folder1' and 'folder2' as arguments to file.path() to make a
# platform-independent pathname.
file.path("folder1", "folder2")
# Create a directory in the current working directory called "testdir2" and
# a subdirectory for it called "testdir3", all in one command by using
# dir.create() and file.path().
dir.create(file.path("testdir2", "testdir3"), recursive = TRUE)
# To delete a directory you need to use the recursive = TRUE argument with
# the function unlink(). If you don't use recursive = TRUE, R is concerned
# that you're unaware that you're deleting a directory and all of its
# contents. R reasons that, if you don't specify that recursive equals TRUE,
# you don't know that something is in the directory you're trying to delete.
# R tries to prevent you from making a mistake.
unlink("testdir2") # doesn't work: the directory is left in place
unlink("testdir2", recursive = TRUE) # works
getwd()
# Go back to the directory the script started in (captured in start_dir)
# instead of a hard-coded, user-specific absolute path.
setwd(start_dir)
getwd()
list.files()
# Delete the 'testdir' directory that you just left (and everything in it).
unlink("testdir", recursive = TRUE)
# Sequences of numbers ----
# The `:` operator is the simplest way to build a sequence of numbers;
# 1:20 shows how it works.
1:20
pi:10
15:1
# Operators have documentation too: back-quote the operator to look it up.
?`:`
# seq() gives more control over a sequence than the `:` operator does.
seq(1, 20)
seq(1, 10, by = 0.5) # increments of one half
my_seq <- seq(5, 10, length.out = 30)
my_seq # 30 evenly spaced values from 5 to 10
length(my_seq)
1:length(my_seq)
seq(along.with = my_seq)
seq_along(my_seq)
# rep(), which stands for 'replicate', is one more function for creating
# vectors of numbers. A few uses:
rep(0, times = 40)
rep(c(0, 1, 2), times = 10) # repeat the whole vector 10 times
rep(c(0, 1, 2), each = 10) # repeat each number 10 times
# Logical vectors and operators ----
num_vect <- c(0.5, 55, -10, 6)
# A comparison is applied element-by-element and yields a logical vector.
tf <- num_vect < 1
tf
num_vect >= 6
# The `<` and `>=` symbols in these examples are called 'logical operators'.
# Other logical operators include `>`, `<=`, `==` for exact equality, and
# `!=` for inequality.
(3 > 5) & (4 == 4)
(TRUE == TRUE) | (TRUE == FALSE)
((111 >= 111) | !(TRUE)) & ((4 + 1) == 5)
# Character vectors ----
# Build a character vector holding the words "My", "name", "is" (each word
# in its own set of double quotes, so that R knows they are character
# strings) and store it in a variable called my_char.
my_char <- c("My", "name", "is")
my_char
paste(my_char, collapse = " ") # joins the elements into a single string
my_name <- c(my_char, "chika chika slam shady") # append another element
my_name
paste(my_name, collapse = " ")
paste("Hello", "world!", sep = " ")
paste(1:3, c("X", "Y", "Z"), sep = "") # integers and characters
# LETTERS is a predefined variable in R containing a character vector of all
# 26 letters in the English alphabet; try pasting it with 1:4.
paste(LETTERS, 1:4, sep = "-")
# Missing values ----
x <- c(44, NA, 5, NA)
x * 3
y <- rnorm(1000) # 1000 draws from a standard normal distribution
z <- rep(NA, 1000) # vector of 1000 NAs
my_data <- sample(c(y, z), 100) # 100 values sampled from y and z combined
my_na <- is.na(my_data) # TRUE where the value is NA, FALSE otherwise
my_na
my_data == NA # won't work: just gives a vector of NAs. Careful!
sum(my_na) # the sum tells us how many TRUEs there are
# A second type of missing value is NaN, which stands for 'not a number'.
0 / 0
Inf - Inf # Inf stands for infinity
# Subsetting vectors ----
x <- rep(c(NA, 2.5, NA, -1), 10) # sample vector
x
# To select particular elements (a 'subset') of a vector, place an
# 'index vector' in square brackets immediately after the vector's name.
x[1:10] # first 10 elements
# Index vectors come in four different flavors: logical vectors, vectors of
# positive integers, vectors of negative integers, and vectors of character
# strings.
x[is.na(x)] # all the NAs in the vector
y <- x[!is.na(x)]
y # '!' negates is.na(), so y holds all the non-NA elements
y[y > 0] # all y values where y > 0
x[!is.na(x) & x > 0] # the two previous steps combined
x[1] # the 1st element
x[c(3, 4, 7)] # the 3rd, 4th and 7th elements
x[0] # gives nothing
x[3000] # gives NA, so be careful about the length of the vector
x[c(-2, -10)] # all the elements other than the 2nd and 10th
x[-c(2, 10)] # same as the previous command
vect <- c(foo = 11, bar = 2, norf = NA) # vector with named elements
vect
names(vect) # all the names
vect2 <- c(11, 2, NA) # create an unnamed vector
names(vect2) <- c("foo", "bar", "norf") # then assign the names
vect2
identical(vect, vect2) # checks whether the two vectors are identical
vect["bar"] # select by name
vect[c("foo", "bar")] # select several elements by name
# Matrices and data frames ----
# | The main difference between the two is that a matrix can only contain a
# | single class of data, while a data frame can consist of many different
# | classes of data.
my_vector <- 1:20
my_vector
dim(my_vector) # a plain vector has no dimensions
length(my_vector) # but it does have a length
dim(my_vector) <- c(4, 5) # assign dimensions: 4 rows and 5 columns
my_vector
dim(my_vector)
attributes(my_vector)
class(my_vector) # it is now a matrix
my_matrix <- my_vector
my_matrix2 <- matrix(data = 1:20, nrow = 4, ncol = 5) # another way to create the matrix
my_matrix2
identical(my_matrix, my_matrix2)
patients <- c("Bill", "Gina", "Kelly", "Sean")
# cbind() turns every element of the matrix into a string, which is not good
# for working with numbers. This is called 'implicit coercion', because we
# didn't ask for it.
cbind(patients, my_matrix)
# A better way to combine them is a data frame.
my_data <- data.frame(patients, my_matrix)
my_data
# | Behind the scenes, the data.frame() function takes any number of
# | arguments and returns a single object of class `data.frame` that is
# | composed of the original objects.
class(my_data)
cnames <- c("patient", "age", "weight", "bp", "rating", "test")
colnames(my_data) <- cnames # set the column names of the data frame
my_data
# Looking at data ----
# | Whenever you're working with a new dataset, the first thing you should
# | do is look at it! What is the format of the data? What are the
# | dimensions? What are the variable names? How are the variables stored?
# | Are there missing data? Are there any flaws in the data?
laliga <- read.csv("SP1.csv")
ls()
class(laliga) # type of the object
dim(laliga) # dimensions
nrow(laliga) # number of rows
ncol(laliga) # number of columns
object.size(laliga) # how much memory the object occupies
names(laliga) # column names
head(laliga) # first 6 rows by default
head(laliga, 10) # first 10 rows
tail(laliga, 15) # last 15 rows
summary(laliga) # per-column summary
table(laliga$HomeTeam) # frequency table of the HomeTeam column
str(laliga) # structure of the data
# Base graphics ----
data(cars)
head(cars)
# Reduce the size of the graphs; otherwise they fill up the screen.
options(repr.plot.width = 4, repr.plot.height = 4)
plot(cars) # first column on the x-axis, second on the y-axis
plot(x = cars$speed, y = cars$dist) # specify the axes explicitly
plot(y = cars$speed, x = cars$dist) # axes switched relative to the previous plot
plot(x = cars$speed, y = cars$dist, xlab = "Speed") # label the x-axis
plot(x = cars$speed, y = cars$dist, xlab = "Speed", ylab = "Stopiing Distance") # label the y-axis too
plot(cars, main = "My Plot") # title
plot(cars, sub = "My Plot Subtitle") # subtitle
plot(cars, col = 2) # change the color of the points
plot(cars, xlim = c(10, 15)) # limit the x-axis
plot(cars, pch = 2) # change the point symbol
data(mtcars) # load the mtcars data set
str(mtcars)
head(mtcars)
boxplot(mpg ~ cyl, data = mtcars) # box plot
hist(mtcars$mpg) # histogram
# Manipulating data with dplyr ----
library("dplyr") # load the package
packageVersion("dplyr") # check the version
mydf <- read.csv("SP1.csv") # read the data set into mydf
# tbl_df() is deprecated in dplyr; as_tibble() is its replacement.
# "The main advantage to using a tbl_df over a regular data frame is the
# printing."
cran <- as_tibble(mydf)
cran # Jupyter notebooks don't show tibbles well

# select(): choose columns ----
# Select the columns needed; the order specified is maintained.
head(select(cran, HomeTeam, AwayTeam, FTAG, FTHG))
head(select(cran, HomeTeam:FTR)) # all columns from HomeTeam to FTR
head(select(cran, FTR:HomeTeam)) # also possible in reverse order
head(select(cran, HomeTeam:FTR, -FTAG)) # -column_name leaves that column out
# Remove all the columns from HS to PSCA, and the Div column.
cran_sub <- select(cran, -(HS:PSCA), -Div)
head(cran_sub)

# filter(): choose rows ----
head(filter(cran_sub, HomeTeam == "Barcelona")) # only rows where HomeTeam is Barcelona
# Rows where HomeTeam is Barcelona and FTR is D (draw at home).
filter(cran_sub, HomeTeam == "Barcelona", FTR == "D")
# Comparison operators work too: rows where HomeTeam is Barcelona and FTHG
# is more than 3 (times Barcelona scored more than 3 goals at home).
filter(cran_sub, HomeTeam == "Barcelona", FTHG > 3)
# Rows where either the home or the away team is Barcelona.
head(filter(cran_sub, AwayTeam == "Barcelona" | HomeTeam == "Barcelona"))
# Barcelona away games where they trailed at half time but won or drew by
# full time.
filter(cran_sub, AwayTeam == "Barcelona", HTHG > HTAG, FTAG >= FTHG)
# Rows where FTHG is missing (the original run found none).
filter(cran_sub, is.na(FTHG))
# Adding !is.na() keeps only the rows where FTHG is not NA.
head(filter(cran_sub, !is.na(FTHG)))

# arrange(): sort rows ----
head(arrange(cran_sub, FTHG)) # sort by FTHG, ascending
head(arrange(cran_sub, desc(FTHG))) # desc() sorts descending
# Sort by HomeTeam ascending first, then by FTHG descending.
head(arrange(cran_sub, HomeTeam, desc(FTHG)))

# mutate(): add columns ----
cran_GD <- mutate(cran_sub, GD = FTHG - FTAG)
head(cran_GD) # new column GD = FTHG - FTAG
# Similarly you can add, subtract, multiply and divide columns to create new
# columns.

# summarise(): collapse to a summary ----
summarise(cran_sub, AHG = mean(FTHG)) # average home goals
summarise(cran_sub, AAG = mean(FTAG)) # average away goals
summarise(cran_GD, AGD = mean(abs(GD))) # average absolute goal difference
# Grouping and chaining with dplyr ----
library("dplyr")
mydf <- read.csv("SP1.csv")
# tbl_df() is deprecated in dplyr; as_tibble() is its replacement.
cran <- as_tibble(mydf)
cran_sub <- select(cran, -(HS:PSCA), -Div)
head(cran_sub)
summarise(cran_sub, count = n())
# group_by() is a very important function for data analysis: the verbs that
# follow operate per group.
by_team <- group_by(cran_sub, HomeTeam)
team_sum <- summarise(
  by_team,
  count = n(),
  unique = n_distinct(FTHG),
  avg_hg = mean(FTHG)
)
head(team_sum)
# Per home team: number of home games, number of distinct home-goal values,
# and average home goals (in the original run every team had 19 home games).
# | We need to know the value of 'avg_hg' that splits the teams into
# | the top 10% and bottom 90% based on average home goals. In
# | statistics, this is called the 0.90, or 90%, sample quantile.
# | Use quantile(team_sum$avg_hg, probs = 0.90) to determine this
# | number.
quantile(team_sum$avg_hg, probs = 0.90) # in the original run: about 2.5
# Teams above that threshold, i.e. the top 10% (two teams in the original
# run).
filter(team_sum, avg_hg > 2.5)
arrange(filter(team_sum, avg_hg > 2.5), desc(avg_hg)) # sorted
# | In this script, we've used a special chaining operator, %>%,
# | which was originally introduced in the magrittr R package and
# | has now become a key component of dplyr. You can pull up the
# | related documentation with ?chain. The benefit of %>% is that
# | it allows us to chain the function calls in a linear fashion.
# | The code to the right of %>% operates on the result from the
# | code to the left of %>%.
cran_sub %>%
  group_by(HomeTeam) %>%
  summarise(count = n(), unique = n_distinct(FTHG), avg_hg = mean(FTHG)) %>%
  filter(avg_hg > 2.5) %>%
  arrange(desc(avg_hg))
# 1. group by HomeTeam
# 2. summarise the data
# 3. filter based on a condition
# 4. arrange
# All without saving intermediate variables, and in a linear fashion.