Explore The Data

# Load text file into local variable called 'data'
data = read.delim(file = 'purchases.txt', header = FALSE, sep = '\t', dec = '.')
# Display what has been loaded
head(data)
# FOR EACH COLUMN prints min, quadrants, median, mean, max
summary(data)
       V1               V2                   V3       
 Min.   :    10   Min.   :   5.00   2013-12-31:  864  
 1st Qu.: 57720   1st Qu.:  25.00   2006-12-31:  584  
 Median :102440   Median :  30.00   2012-12-31:  583  
 Mean   :108935   Mean   :  62.34   2011-12-31:  510  
 3rd Qu.:160525   3rd Qu.:  60.00   2008-12-31:  503  
 Max.   :264200   Max.   :4500.00   2014-12-31:  485  
                                    (Other)   :47714  

Transform The Data

# Add headers and interpret the last column as a date, extract year of purchase
# rename columns to human understandable representations
colnames(data) = c('customer_id', 'purchase_amount', 'date_of_purchase')
# tell R this column is a date variable
data$date_of_purchase = as.Date(data$date_of_purchase, "%Y-%m-%d")
# create a new column that contains the year of purchase
data$year_of_purchase = as.numeric(format(data$date_of_purchase, "%Y"))
# Display the data set after transformation
head(data)
summary(data)
  customer_id     purchase_amount   date_of_purchase    
 Min.   :    10   Min.   :   5.00   Min.   :2005-01-02  
 1st Qu.: 57720   1st Qu.:  25.00   1st Qu.:2009-01-17  
 Median :102440   Median :  30.00   Median :2011-11-23  
 Mean   :108935   Mean   :  62.34   Mean   :2011-07-14  
 3rd Qu.:160525   3rd Qu.:  60.00   3rd Qu.:2013-12-29  
 Max.   :264200   Max.   :4500.00   Max.   :2015-12-31  
 year_of_purchase
 Min.   :2005    
 1st Qu.:2009    
 Median :2011    
 Mean   :2011    
 3rd Qu.:2013    
 Max.   :2015    

Use SQL and visualize the Data

# To explore the data using simple SQL statements
# we'll use sqldf package which allows us to use a
# SQL database like an R dataframe & a dataframe like a SQL DB
library(sqldf)
# Number of purchases per year
## SELECT sql statement which is used to fetch the data from a database
## and returns results in the form of a data table
# see how many purchases were made each year from data set 'data':
# 'SELECT each year_of_purchase and COUNT each instance of year_of_purchase
x = sqldf("SELECT year_of_purchase, COUNT(year_of_purchase) AS 'counter' FROM data GROUP BY 1 ORDER BY 1")
# now plot the counted number of purchases by year as a bar plot
barplot(x$counter, names.arg = x$year_of_purchase)

# Average purchase amount per year
x = sqldf("SELECT year_of_purchase, AVG(purchase_amount) AS 'avg_amount' FROM data GROUP BY 1 ORDER BY 1")
barplot(x$avg_amount, names.arg = x$year_of_purchase)

# Total purchase amounts per year
x = sqldf("SELECT year_of_purchase, SUM(purchase_amount) AS 'sum_amount' FROM data GROUP BY 1 ORDER BY 1")
barplot(x$sum_amount, names.arg = x$year_of_purchase)

# All in one
x = sqldf("SELECT year_of_purchase,
                  COUNT(year_of_purchase) AS 'counter',
                  AVG(purchase_amount) AS 'avg_amount',
                  SUM(purchase_amount) AS 'sum_amount'
           FROM data GROUP BY 1 ORDER BY 1")
print(x)
LS0tCnRpdGxlOiAiV2VlayAxIE5vdGVib29rIgpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sKLS0tCgojIyBFeHBsb3JlIFRoZSBEYXRhCgpgYGB7ciBleHBsb3JlLWRhdGF9CiMgTG9hZCB0ZXh0IGZpbGUgaW50byBsb2NhbCB2YXJpYWJsZSBjYWxsZWQgJ2RhdGEnCmRhdGEgPSByZWFkLmRlbGltKGZpbGUgPSAncHVyY2hhc2VzLnR4dCcsIGhlYWRlciA9IEZBTFNFLCBzZXAgPSAnXHQnLCBkZWMgPSAnLicpCgojIERpc3BsYXkgd2hhdCBoYXMgYmVlbiBsb2FkZWQKaGVhZChkYXRhKQojIEZPUiBFQUNIIENPTFVNTiBwcmludHMgbWluLCBxdWFkcmFudHMsIG1lZGlhbiwgbWVhbiwgbWF4CnN1bW1hcnkoZGF0YSkKYGBgCgoKIyMgVHJhbnNmb3JtIFRoZSBEYXRhCgpgYGB7ciB0cmFuc2Zvcm0tZGF0YX0KIyBBZGQgaGVhZGVycyBhbmQgaW50ZXJwcmV0IHRoZSBsYXN0IGNvbHVtbiBhcyBhIGRhdGUsIGV4dHJhY3QgeWVhciBvZiBwdXJjaGFzZQojIHJlbmFtZSBjb2x1bW5zIHRvIGh1bWFuIHVuZGVyc3RhbmRhYmxlIHJlcHJlc2VudGF0aW9ucwpjb2xuYW1lcyhkYXRhKSA9IGMoJ2N1c3RvbWVyX2lkJywgJ3B1cmNoYXNlX2Ftb3VudCcsICdkYXRlX29mX3B1cmNoYXNlJykKIyB0ZWxsIFIgdGhpcyBjb2x1bW4gaXMgYSBkYXRlIHZhcmlhYmxlCmRhdGEkZGF0ZV9vZl9wdXJjaGFzZSA9IGFzLkRhdGUoZGF0YSRkYXRlX29mX3B1cmNoYXNlLCAiJVktJW0tJWQiKQojIGNyZWF0ZSBhIG5ldyBjb2x1bW4gdGhhdCBjb250YWlucyB0aGUgeWVhciBvZiBwdXJjaGFzZQpkYXRhJHllYXJfb2ZfcHVyY2hhc2UgPSBhcy5udW1lcmljKGZvcm1hdChkYXRhJGRhdGVfb2ZfcHVyY2hhc2UsICIlWSIpKQpgYGAKCmBgYHtyIGRpc3BsYXktdHJhbnNmb3JtfQojIERpc3BsYXkgdGhlIGRhdGEgc2V0IGFmdGVyIHRyYW5zZm9ybWF0aW9uCmhlYWQoZGF0YSkKc3VtbWFyeShkYXRhKQpgYGAKCgojIyBVc2UgU1FMIGFuZCB2aXN1YWxpemUgdGhlIERhdGEKCmBgYHtyIHVzaW5nLXNxbCwgd2FybmluZz1GQUxTRSwgbWVzc2FnZT1GQUxTRX0KIyBUbyBleHBsb3JlIHRoZSBkYXRhIHVzaW5nIHNpbXBsZSBTUUwgc3RhdGVtZW50cwojIHdlJ2xsIHVzZSBzcWxkZiBwYWNrYWdlIHdoaWNoIGFsbG93cyB1cyB0byB1c2UgYQojIFNRTCBkYXRhYmFzZSBsaWtlIGFuIFIgZGF0YWZyYW1lICYgYSBkYXRhZnJhbWUgbGlrZSBhIFNRTCBEQgpsaWJyYXJ5KHNxbGRmKQoKCiMgTnVtYmVyIG9mIHB1cmNoYXNlcyBwZXIgeWVhcgojIyBTRUxFQ1Qgc3FsIHN0YXRlbWVudCB3aGljaCBpcyB1c2VkIHRvIGZldGNoIHRoZSBkYXRhIGZyb20gYSBkYXRhYmFzZQojIyBhbmQgcmV0dXJucyByZXN1bHRzIGluIHRoZSBmb3JtIG9mIGEgZGF0YSB0YWJsZQojIHNlZSBob3cgbWFueSBwdXJjaGFzZXMgd2VyZSBtYWRlIGVhY2ggeWVhciBmcm9tIGRhdGEgc2V0ICdkYXRhJzoKIyAnU0VMRUNUIGVhY2ggeWVhcl9vZl9wdXJjaGFzZSBhbmQgQ09VTlQgZWFjaCBpbnN0YW5jZSBvZiB5ZWFyX29mX3B1cmNoYXNlCnggPSBzcWxkZigiU0VMRUNUIHllYXJfb2ZfcHVyY2hhc2UsIENPVU5UKHllYXJfb2ZfcHVyY2hhc2UpIEFTICdjb3VudGVyJyBGUk9NIGRhdGEgR1JPVVAgQlkgMSBPUkRFUiBCWSAxIikKCiMgbm93IHBsb3QgdGhlIGNvdW50ZWQgbnVtYmVyIG9mIHB1cmNoYXNlcyBieSB5ZWFyIGFzIGEgYmFyIHBsb3QKYmFycGxvdCh4JGNvdW50ZXIsIG5hbWVzLmFyZyA9IHgkeWVhcl9vZl9wdXJjaGFzZSkKCiMgQXZlcmFnZSBwdXJjaGFzZSBhbW91bnQgcGVyIHllYXIKeCA9IHNxbGRmKCJTRUxFQ1QgeWVhcl9vZl9wdXJjaGFzZSwgQVZHKHB1cmNoYXNlX2Ftb3VudCkgQVMgJ2F2Z19hbW91bnQnIEZST00gZGF0YSBHUk9VUCBCWSAxIE9SREVSIEJZIDEiKQpiYXJwbG90KHgkYXZnX2Ftb3VudCwgbmFtZXMuYXJnID0geCR5ZWFyX29mX3B1cmNoYXNlKQoKIyBUb3RhbCBwdXJjaGFzZSBhbW91bnRzIHBlciB5ZWFyCnggPSBzcWxkZigiU0VMRUNUIHllYXJfb2ZfcHVyY2hhc2UsIFNVTShwdXJjaGFzZV9hbW91bnQpIEFTICdzdW1fYW1vdW50JyBGUk9NIGRhdGEgR1JPVVAgQlkgMSBPUkRFUiBCWSAxIikKYmFycGxvdCh4JHN1bV9hbW91bnQsIG5hbWVzLmFyZyA9IHgkeWVhcl9vZl9wdXJjaGFzZSkKCiMgQWxsIGluIG9uZQp4ID0gc3FsZGYoIlNFTEVDVCB5ZWFyX29mX3B1cmNoYXNlLAogICAgICAgICAgICAgICAgICBDT1VOVCh5ZWFyX29mX3B1cmNoYXNlKSBBUyAnY291bnRlcicsCiAgICAgICAgICAgICAgICAgIEFWRyhwdXJjaGFzZV9hbW91bnQpIEFTICdhdmdfYW1vdW50JywKICAgICAgICAgICAgICAgICAgU1VNKHB1cmNoYXNlX2Ftb3VudCkgQVMgJ3N1bV9hbW91bnQnCiAgICAgICAgICAgRlJPTSBkYXRhIEdST1VQIEJZIDEgT1JERVIgQlkgMSIpCnByaW50KHgpCmBgYA==