setwd("/home/perovich/Desktop/week12/danceStats")

#Read in the files: created through python script w/Rpy2
salsa<-read.csv("data/salsa.csv")
chacha<-read.csv("data/chacha.csv")
arumba<-read.csv("data/amRumba.csv")
irumba<-read.csv("data/intRumba.csv")
swing<-read.csv("data/swing.csv")

#Make sure there are no dummy rows around--for now that means all feet are L (since right has yet to make it into the code)
salsa<-subset(salsa, salsa$foot=="L")
chacha<-subset(chacha, chacha$foot=="L")
arumba<-subset(arumba, arumba$foot=="L")
irumba<-subset(irumba, irumba$foot=="L")
swing<-subset(swing, swing$foot=="L")

dances<-list(salsa, chacha, arumba, irumba, swing)
names(dances)<-c("salsa", "chacha", "arumba", "irumba", "swing")

#Truncate a bit to drop out the start/end junk 
#(this shouldn't matter much because there's a lot of data to wash it out--but just in case)
cutter<-function(x,y){
  #x is the dataset
  #y is the total percentage of the data you want to drop (half from each end) (as a decimal)
  rowcount<-nrow(x)
  rowcutends<-floor(rowcount*y/2)
  x2<-x[rowcutends:(rowcount-rowcutends),]
  return(x2)
}

#Function that makes the pressure data yes/no
binaried<-function(x, y){
  #x is the dataset with column "press", y is the numeric cutpoint for 0 vs 1 binning
  x$press<-ifelse(x$press<y, 0, 1)
  return(x)
}

#Cutting, on/off, summary stats
dancesB<-dances
dancesSum<-dances
dancesBSum<-dances
for (i in 1:length(dances)){
  dances[[i]]<-cutter(dances[[i]], .02)
  dancesB[[i]]<-binaried(dances[[i]], 20)
  dancesSum[[i]]<-summary(dances[[i]]$press,  options(scipen=2))
  dancesBSum[[i]]<-summary(dancesB[[i]]$press,  options(scipen=2))
  }

dancesBSum
dancesSum
#Moral of the summary story (without limited data--need more reps of each dance)
#swing has much higher median and mean than anything
#American rumba has much higher mean than international rumba
#chacha and salsa summary stats look pretty similar
#(could probably do some basic differentiation from this alone if you limit the dances)
#(hopefully--speed or person might change this)


#make some graphs--everything and smaller windows to try to catch cycles
for (i in 1:length(dances))
  plot(dances[[i]]$ttime, dances[[i]]$press, xlim=c(500, 1000), main=names(dances)[[i]])
#Visually I can see patterns and repetition here, not entirely sure how to describe
#Also, sensor seems to favor "on" over "off" (or standing on L foot favored across all dances)
#I think it just takes awhile for the connection to drop.

#Approaches--pull just the zero times (or <50)
#ratio time on/off for each?
#time patterns?

#Plotting binary versions make some graphs--everything and smaller windows to try to catch cycles
for (i in 1:length(dancesB))
  plot(dancesB[[i]]$ttime, dancesB[[i]]$press, xlim=c(500, 1000), main=names(dancesB)[[i]])


#Mess with some modeling options

#plain probit
#http://www.ats.ucla.edu/stat/r/dae/probit.htm
dancesProbit<-dancesB
for (i in 1:length(dancesB))
dancesProbit[[i]]<- glm(press ~ ttime, family = binomial(link = "probit"), 
                data = dancesB[[i]])

## model summary
for (i in 1:length(dancesProbit)){
print(names(dancesProbit)[[i]])
print(signif(summary(dancesProbit[[i]])$coefficients,2))
}
#Reasonable-ish split on the ttime estimated values?


#Try time series:
dancesT<-dances
dancesBT<-dancesB
for (i in 1:length(dancesT)){
  dancesT[[i]]<-ts(dancesT[[i]]$press)
  dancesBT[[i]]<-ts(dancesBT[[i]]$press)
}
#Estimates of the autocovariance or autocorrelation function
for (i in 1:length(dancesBT)){
  jpeg(paste("plots/acfBinary", names(dancesBT)[[i]], ".jpg", sep=''))
  print(acf(dancesBT[[i]], main=names(dancesBT)[[i]]))
  dev.off()
  jpeg(paste("plots/acf", names(dancesT)[[i]], ".jpg", sep=''))
  print(acf(dancesT[[i]], main=names(dancesT)[[i]]))
  dev.off()
  }
#THIS IS USEFUL!  USE THIS ONE MORE!


#These aren't necessarily ideal looking, but some patterns.  Autocorrelation is probably better
for (i in 1:length(dances))
  plot.ts(dances[[i]]$press, xlim=c(300,550), main=names(dances)[[i]])


#Try someone's function for finding the period of a time series
#http://stats.stackexchange.com/questions/1207/period-detection-of-a-generic-time-series
find.freq <- function(x)
{
  n <- length(x)
  spec <- spec.ar(c(x),plot=FALSE)
  if(max(spec$spec)>10) # Arbitrary threshold chosen by trial and error.
  {
    period <- round(1/spec$freq[which.max(spec$spec)])
    if(period==Inf) # Find next local maximum
    {
      j <- which(diff(spec$spec)>0)
      if(length(j)>0)
      {
        nextmax <- j[1] + which.max(spec$spec[j[1]:500])
        period <- round(1/spec$freq[nextmax])
      }
      else
        period <- 1
    }
  }
  else
    period <- 1
  return(period)
}

#Try it on the regular looking time series data
dancesFreq<-dancesT
dancesFreq2<-data.frame(names(dancesFreq), c(1:length(dancesFreq)))
colnames(dancesFreq2)<-c("dance", "Freq")
for (i in 1:length(dancesFreq)){
dancesFreq[[i]]<-find.freq(dancesT[[i]])
dancesFreq2$Freq[[i]]<-dancesFreq[i][[1]]
}

#Try it on the binary version of the time series data
dancesFreqB<-dancesBT
for (i in 1:length(dancesFreqB))
  dancesFreqB[[i]]<-find.freq(dancesBT[[i]])
#DOESN"T WORK ON THE BINARY VERSION.

#ANOTHER PLAN: Find the change points and do something with them....(diffeq)


###########################################################################
####################### REAL DATA #########################################
###########################################################################
#See if I can match new data to the existing dance data.

#Read in the files: created through python script w/Rpy2
salsaT1<-read.csv("data/salsaTest1.csv")
chachaT1<-read.csv("data/chachaTest1.csv")
arumbaT1<-read.csv("data/AmRumbaTest1.csv")
irumbaT1<-read.csv("data/intRumbaTest1.csv")
swingT1<-read.csv("data/swingTest1.csv")
salsaT2<-read.csv("data/salsaTest2.csv")
chachaT2<-read.csv("data/chachaTest2.csv")
arumbaT2<-read.csv("data/AmRumbaTest2.csv")
irumbaT2<-read.csv("data/intRumbaTest2.csv")
swingT2<-read.csv("data/swingTest2.csv")


tests<-list(salsaT1,salsaT2, chachaT1, chachaT2, arumbaT1, arumbaT2, irumbaT1, irumbaT2, swingT1, swingT2)
names(tests)<-c("salsaT1","salsaT2", "chachaT1", "chachaT2", "arumbaT1", "arumbaT2", "irumbaT1", "irumbaT2", "swingT1", "swingT2")

#Cleaning, binary versions, summary stats etc
testsB<-tests
testsSum<-tests
for (i in 1:length(tests)){
  tests[[i]]<-subset(tests[[i]], tests[[i]]$foot=="L")
  tests[[i]]<-cutter(tests[[i]], .02)
  testsB[[i]]<-binaried(tests[[i]], 20)
  testsSum[[i]]<-summary(tests[[i]]$press,  options(scipen=2))
  }

dancesSum

#Reformat the results so it can be used to match more easily
#could us plyr instead probably better.
sumTable<-function(dancesSum){
  meanss<-c(1:length(dancesSum))
  medianss<-c(1:length(dancesSum))
  for (i in 1:length(dancesSum)){
  meanss[[i]]<-dancesSum[[i]]["Mean"][[1]]
  medianss[[i]]<-dancesSum[[i]]["Median"][[1]]
  }
  MeanTable<-data.frame(names(dancesSum), meanss, medianss)
  colnames(MeanTable)<-c("dance", "Mean", "Median")
  return(MeanTable)
  }
danceTable<-sumTable(dancesSum)

testTable<-sumTable(testsSum)
danceTable$dance<-as.character(danceTable$dance)
for (i in 1:nrow(testTable)){
  testTable$GuessMean[[i]]<-danceTable$dance[[which.min(abs(testTable$Mean[[i]]-danceTable$Mean))]]
  testTable$GuessMedian[[i]]<-danceTable$dance[[which.min(abs(testTable$Median[[i]]-danceTable$Median))]]
}
testTable
#FAILURE!  Everything was higher this time around it seems
#SUMMARY STATS ARE NOT THE WAY TO IDENTIFY THESE!

#See how predictive the frequency estimates are 
#(not super optimistic here--speed should change this)
#(maybe estimates can be used to pick out chunks to try to match though)
#Try it on the regular looking time series data
#Try time series:
testsT<-tests
for (i in 1:length(testsT))
  testsT[[i]]<-ts(testsT[[i]]$press)
#estimates of the autocovariance or autocorrelation function (???)
for (i in 1:length(testsT)){
  jpeg(paste("plots/acf", names(testsT)[[i]], ".jpg", sep=''))
  print(acf(testsT[[i]], main=names(testsT)[[i]]))
  dev.off()
}
#These are maybe useful for matching....not totally consistent though....
#Note: blue lines are 95th confidence cut-offs--so basically all our data are significant

#Try this other numerical thing
testsTmod<-testsT
for (i in 1:length(testsTmod))
testsTmod[[i]]<-ar.yw(testsT[[i]])

#Pull orders
testsTorders<-testsTmod
testsTorders<-data.frame(names(testsTmod), c(1:length(testsTmod)))
colnames(testsTorders)<-c("dancetest", "order")
for (i in 1:nrow(testsTorders)){
  testsTorders$order[[i]]<-testsTmod[[i]]$order
}
#Not great. Maybe order isn't the thing to look at......

#Time series plots
for (i in 1:length(tests)){
  jpeg(paste("plots/TS", names(testsT)[[i]], ".jpg", sep=''))
  plot.ts(tests[[i]]$press, xlim=c(50,250), main=names(tests)[[i]])
  dev.off()
}


#Try it on the regular looking time series data
testsFreq<-testsT
testsFreq2<-data.frame(names(testsFreq), c(1:length(testsFreq)))
colnames(testsFreq2)<-c("dancetest", "Freq")
for (i in 1:length(testsFreq)){
  testsFreq[[i]]<-find.freq(testsT[[i]])
  testsFreq2$Freq[[i]]<-testsFreq[i][[1]]
}

#Clean up to get predictions
testsFreq2$dancetest<-as.character(testsFreq2$dancetest)
testsFreq2$dance<-as.character(testsFreq2$dancetest)
for (i in 1:nrow(testsFreq2)){
testsFreq2$Guess[[i]]<-as.character(dancesFreq2$dance)[[which.min(abs(testsFreq2$Freq[[i]]-dancesFreq2$Freq))]]
testsFreq2$dance[[i]]<-substring(testsFreq2$dancetest[[i]], 1, nchar(testsFreq2$dancetest[[i]])-2)
}
DF2<-dancesFreq2
colnames(DF2)<-c("dance", "officialFreq")
testsFreq2<-merge(testsFreq2, DF2, by="dance", all=TRUE)

#NOT TOO BAD!!!!
#Everything but salsa seemed to match up okay....
#definitely depeds on speed of motion though because irumba with a song got messed up too.
#mmmmmm, not robust.  I want it based on shape of graph.


#More to do here....


#