Modeling pupil attendance during a deworming initiative using SVDs and HMMs

Context

worms

Many children in developing countries are impacted by worm parrasites. These worms are usually picked up through kids' skin tissue. In children's bodies, these worms have many adverse effects including consuming children's nutrition.

Having worms can dramatically alter a child's life. Worms have been shown to lead to significantly decreased school attendance, in turn causing reduced employment and salaries in adulthood, among other effects.

Project Proposal

I want to model to try and predict whether students will attend school or not. Starting with an SVD and then using HMMs. Students will have a hidden state and Observations are available for students in the form of school attendance. The hypothesis is that students in certain states or with certain levels of illness will attend school less and this can be used to infer their hidden state/illness. Once a model is learned the Viterbi algorithm can be used to determine the most likely states.

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [4]:
Attendance = pd.read_stata("data/namelist.dta")
#print(Attendance.iloc[0:3])
Attendance = Attendance.set_index('pupid')

SchoolVar = pd.read_stata("data/schoolvar.dta")
#print(SchoolVar.dtypes)

Comply = pd.read_stata("data/comply.dta")
Comply[['pupid']] = Comply[['pupid']].apply(np.int64)
Comply = Comply.set_index('pupid')

wormed = pd.read_stata("data/wormed.dta")
#print(pupq.iloc[0:2])
wormed = wormed.set_index('pupid')

#worms = pd.read_csv("ted_miguel_worms.csv")
#print(worms.iloc[:5])
In [5]:
visit =  Attendance['visit']
visitCategories = visit.cat.categories
schoolIds = SchoolVar['schid']
visitNum = [cat for cat in visitCategories]
groups = range(1,4)
In [906]:
meanAttendanceDay = np.zeros((3, len(visitNum), 2))

for i in range(len(groups)):
    for j in range(len(visitNum)):
        val = Attendance[(Attendance["visit"]==visitNum[j]) & (Attendance["wgrp"]==groups[i]) & (Attendance["obs"]==1)]
        meanAttendanceDay[i][j] = (val['prs'].mean(), val['date'].mean())
        
        
plt.figure()
plt.title("mean attendance by visit for different groups in the study")

for i in range(len(groups)):
    plt.plot(meanAttendanceDay[i,:,1], meanAttendanceDay[i,:,0], label="group"+str(i+1))
    plt.legend()
    plt.xlabel("day")
    
#plt.show()
In [805]:
meanAttendanceSchoolDay = np.zeros((3, len(visitNum)*len(schoolIds), 3))

for i in range(len(groups)):
    for j in range(len(visitNum)):
        for k in range(len(schoolIds)):
            val = Attendance[(Attendance["visit"]==visitNum[j]) & (Attendance['schid']==schoolIds[k]) & (Attendance["wgrp"]==groups[i]) & (Attendance["obs"]==1)]
            meanAttendanceSchoolDay[i][j*len(schoolIds) + k] = (val['prs'].mean(), val['date'].mean(), schoolIds[k])
In [806]:
plt.figure()
plt.title("mean attendance by visit for different groups in the study, by school")

for i in range(len(groups)):
    plt.scatter(meanAttendanceSchoolDay[i,:,1], meanAttendanceSchoolDay[i,:,0], label="group"+str(i+1), color=['r','g','b'][i])
    plt.legend()
    
plt.xlabel("day")
    
plt.show()
In [957]:
plt.figure()
for i in range(len(groups)):
    groupAttendance = Attendance[(Attendance["wgrp"]==groups[i]) & (Attendance["obs"]==1)]
    #print groupAttendance['date']
    plt.scatter(groupAttendance['date'][:10000], groupAttendance['prs'][:10000], color=['r','g','b'][i])

plt.xlabel("day")
plt.show()
In [808]:
print Comply.columns
Index([u'sch98v1', u'sch981c', u'std981c', u'any98', u'a981', u'whyno981',
       u'a982', u'whyno982', u'p98', u'whyno98p', u'psch98', u'sch991c',
       u'std991c', u'consnt99', u'any99', u'a991', u'whyno991', u'a992',
       u'whyno992', u'p99', u'psch99'],
      dtype='object')
In [809]:
print Attendance.columns
Index([u'dupid', u'visit', u'schid', u'std', u'sex', u'obs', u'prs',
       u'totobs98', u'totprs98', u'totpar98', u'totobs99', u'totprs99',
       u'totpar99', u'totobs', u'totprs', u'totpart', u'sch98v1', u'wgrp',
       u'wgrp1', u'wgrp2', u'wgrp3', u'yrbirth', u'elg98', u'elg99', u'stdgap',
       u'sap1', u'sap2', u'sap3', u'sap4', u'std98v1', u'date', u'Tmonths',
       u'Isem1', u'Isem2', u'Isem3'],
      dtype='object')
In [ ]:
 

Prediciting pupil presence on the eight visit using past attendance, age, sex w/ SVD

In [886]:
def dropNullCat(data, cats=[], all=False):
    if all:
        cats = data.columns.values
    
    if type(cats) is str:
        mask = data[cats].notnull()
    else:
        mask = data[cats[0]].notnull()
        for i in range(1,len(cats)):
            mask = mask&(data[cats[i]].notnull())  
        
    return data[mask]
    
def ObsContainPeriods(data, periods, dropOther=True):
    cats=visitCategories[periods]
    data2=dropNullCat(data, cats)
    drops = []
    
    if dropOther:
        drops = [cat for cat in visitCategories if cat not in cats and cat in data2]
    return data2.drop(drops, axis=1)
In [811]:
pivotAttendanceByPupid = pd.pivot_table(Attendance,index='pupid', values='prs', columns=['visit'])
In [821]:
#genAttendanceByPupid = Attendance.groupby('pupid').first().drop(['visit','obs','prs','wgrp','date','std','Tmonths'], axis=1)
genAttendanceByPupidWhole = Attendance.groupby('pupid').first()
genAttendanceByPupid = genAttendanceByPupidWhole[['wgrp1', 'wgrp2', 'sex', 'yrbirth']]
In [813]:
AttendanceByPupid = genAttendanceByPupid.join(pivotAttendanceByPupid)
In [814]:
def maxObs(data, num, i=0):
    if num==0:
        return (len(data), [])
    elif i==len(visitCategories)-num:
        return (0, [])
    else:
        drop = maxObs(dropNullCat(data,visitCategories[i]), num-1, i+1)
        keep = maxObs(data, num, i+1)
        if drop[0] > keep[0]:
            return (drop[0], drop[1]+[i])
        else:
            return keep
        
print maxObs(AttendanceByPupid, 9)
(6797, [14, 13, 12, 11, 8, 6, 4, 3, 2])
In [815]:
AttendanceByPupidHighConsistentObs = ObsContainPeriods(AttendanceByPupid, [2,3,4,6,8,11,12,13,14])

print AttendanceByPupidHighConsistentObs.mean()-AttendanceByPupid[AttendanceByPupidHighConsistentObs.columns.values].mean()

svdData = dropNullCat(AttendanceByPupidHighConsistentObs,['sex','yrbirth'])

print 'svdDataLen:', len(svdData)
wgrp1            0.021399
wgrp2           -0.061368
sex             -0.026783
yrbirth         -0.389213
Visit 3, 1998   -0.003816
Visit 4, 1998    0.031066
Visit 5, 1998    0.049097
Visit 7, 1998    0.022275
Visit 1, 1999    0.056709
Visit 4, 1999    0.060454
Visit 5, 1999    0.060599
Visit 6, 1999    0.064430
Visit 7, 1999    0.075109
dtype: float64
svdDataLen: 5883
In [816]:
print sum(genAttendanceByPupidWhole['totobs']==8)
9113
In [891]:
y = svdData['Visit 7, 1999']

X = svdData.drop('Visit 7, 1999', axis=1)
X['offset'] = 1.

b = np.linalg.pinv(X).dot(y)

First, how well does guessian the median past attendance work?

In [945]:
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
avg sum squared error with median: 0.232896866265
In [896]:
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print 'n\n coefficients:'
for i in range(len(b)):
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.120719561319
avg sum squared error with median: 0.195138534761
n
 coefficients:
wgrp1 coefficient: -0.060147
wgrp2 coefficient: 0.004408
sex coefficient: 0.003398
yrbirth coefficient: -0.006324
Visit 3, 1998 coefficient: -0.089036
Visit 4, 1998 coefficient: -0.021038
Visit 5, 1998 coefficient: 0.112973
Visit 7, 1998 coefficient: 0.111699
Visit 1, 1999 coefficient: 0.110015
Visit 4, 1999 coefficient: 0.137728
Visit 5, 1999 coefficient: 0.255665
Visit 6, 1999 coefficient: 0.193232
offset coefficient: 12.708935
In [895]:
for i in range(1,8):
    X2 = ObsContainPeriods(X, [2,3,4,6,8,11,12,13][i:])    
    X2.dtypes
    b2 = np.linalg.pinv(X2).dot(y)
    print '\n\nSVD with past %d observations' % (8-i)
    print 'avg sum squared error with SVD:', sum(abs(X2.dot(b2)-y)**2)/len(y)
    print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)

SVD with past 7 observations
avg sum squared error with SVD: 0.121226852116
avg sum squared error with median: 0.195138534761


SVD with past 6 observations
avg sum squared error with SVD: 0.121227254103
avg sum squared error with median: 0.195138534761


SVD with past 5 observations
avg sum squared error with SVD: 0.121839760431
avg sum squared error with median: 0.195138534761


SVD with past 4 observations
avg sum squared error with SVD: 0.122019625698
avg sum squared error with median: 0.195138534761


SVD with past 3 observations
avg sum squared error with SVD: 0.123380191082
avg sum squared error with median: 0.195138534761


SVD with past 2 observations
avg sum squared error with SVD: 0.126185372082
avg sum squared error with median: 0.195138534761


SVD with past 1 observations
avg sum squared error with SVD: 0.138348475639
avg sum squared error with median: 0.195138534761
In [828]:
ComplyPupidIndex = Comply[['a981','a982','a991','a992']]

ComplyObsHigh = ComplyPupidIndex.loc[svdData.index]

print len(ComplyObsHigh.index)
print len(svdData.index)

svdData2 = svdData.join(ComplyObsHigh)

svdData2 = dropNullCat(svdData2, all=True)

print len(svdData2)
5890
5883
5865

Including past infection rates in predicting attendance w/ SVD

In [900]:
y = svdData2['Visit 7, 1999']

X = svdData2.drop('Visit 7, 1999', axis=1)
X['offset'] = 1.

b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'
for i in range(len(b)):
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.119677635849
avg sum squared error with median: 0.19505541347


 coefficients:
wgrp1 coefficient: -0.092847
wgrp2 coefficient: -0.047939
sex coefficient: -0.009927
yrbirth coefficient: -0.010686
Visit 3, 1998 coefficient: -0.084467
Visit 4, 1998 coefficient: -0.022731
Visit 5, 1998 coefficient: 0.116203
Visit 7, 1998 coefficient: 0.103322
Visit 1, 1999 coefficient: 0.105596
Visit 4, 1999 coefficient: 0.132274
Visit 5, 1999 coefficient: 0.249723
Visit 6, 1999 coefficient: 0.192473
a981 coefficient: 0.001462
a982 coefficient: -0.020215
a991 coefficient: 0.003360
a992 coefficient: 0.083579
offset coefficient: 21.391294
In [830]:
covariance = np.cov(X.T)
eigenvalues, M = np.linalg.eig(covariance)
M = M.T
In [831]:
diagonalizedCovariance = M.dot(covariance.dot(M.T))
In [832]:
print eigenvalues
[ 6.27420936  0.63837473  0.38791665  0.26934667  0.23118722  0.15377697
  0.00989239  0.12863607  0.11150248  0.03729799  0.04489787  0.04990477
  0.06357035  0.07697328  0.09023614  0.08182559  0.        ]
In [833]:
schools2 = SchoolVar.set_index('schid')[['pup_pop','pop1_1km_updated', 'pop1_2km_updated', 'pop1_3km_updated', 'pop2_1km_updated', 'pop2_2km_updated', 'pop2_3km_updated', 'popT_2km_updated', 'popT_3km_updated', 'popT_1km_updated']]
print len(svdData2)
svdData3 = svdData.join(genAttendanceByPupidWhole.loc[svdData2.index]['schid'], how='left')
print len(svdData3)
svdData3 = svdData3.join(schools2, on='schid', how='left')

svdData3 = dropNullCat(svdData3, all=True)
print 'len change:\n', len(svdData2)
print len(svdData3)
print len(genAttendanceByPupidWhole.loc[svdData2.index]['schid'])
5865
5890
len change:
5865
5865
5865
In [834]:
"""
X = svdData3.loc[:,['pup_pop','pop1_1km_updated', 'pop1_2km_updated', 'pop1_3km_updated', 'pop2_1km_updated', 'pop2_2km_updated', 'pop2_3km_updated', 'popT_1km_updated', 'popT_2km_updated', 'popT_3km_updated']]
X['offset'] = 1.
y = svdData3['Visit 7, 1999']


b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with mean:', sum(abs(y-np.mean(y))**2)/len(y)
print '\n\n coefficients:'

sortedB = np.argsort(np.abs(b))
for i in sortedB:
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
"""
print 

Including treatment group populations in adjacent areas w/ SVD and changing visit being predicted

In [901]:
y = svdData3['Visit 7, 1999']

X = svdData3.drop(['schid','Visit 7, 1999'], axis=1)

X['pup_pop_wgrp1'] = X['wgrp1']*X['pup_pop']
X['pup_pop_wgrp2'] = X['wgrp2']*X['pup_pop']

X = X.drop(['wgrp1','wgrp2'], axis=1)

X = X/(X.max() - X.min())

X['offset'] = 1.

b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'

sortedB = np.argsort(-np.abs(b))
for i in sortedB:
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.119342607142
avg sum squared error with median: 0.19505541347


 coefficients:
offset coefficient: 15.992976
Visit 5, 1999 coefficient: 0.256780
Visit 6, 1999 coefficient: 0.189394
popT_3km_updated coefficient: 0.172406
Visit 5, 1998 coefficient: 0.161055
yrbirth coefficient: -0.152438
Visit 4, 1999 coefficient: 0.127243
Visit 3, 1998 coefficient: -0.119252
Visit 1, 1999 coefficient: 0.116205
Visit 7, 1998 coefficient: 0.101858
pop2_3km_updated coefficient: -0.098802
pup_pop coefficient: 0.086764
pop1_3km_updated coefficient: -0.082438
pup_pop_wgrp1 coefficient: -0.075526
popT_1km_updated coefficient: 0.062056
pop1_1km_updated coefficient: 0.057240
popT_2km_updated coefficient: -0.036314
pop2_2km_updated coefficient: 0.036105
Visit 4, 1998 coefficient: -0.034579
pop1_2km_updated coefficient: 0.020103
pup_pop_wgrp2 coefficient: 0.013087
sex coefficient: 0.010648
pop2_1km_updated coefficient: 0.008779
In [902]:
y = X['Visit 6, 1999']

X = X.drop(['Visit 6, 1999'], axis=1)


b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'

sortedB = np.argsort(-np.abs(b))
for i in sortedB:
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.170315607489
avg sum squared error with median: 0.284228473998


 coefficients:
offset coefficient: -1.604112
pop2_2km_updated coefficient: -0.232876
Visit 5, 1999 coefficient: 0.225927
Visit 4, 1999 coefficient: 0.217932
pop2_3km_updated coefficient: 0.167274
pop1_3km_updated coefficient: -0.123432
Visit 1, 1999 coefficient: 0.118565
Visit 3, 1998 coefficient: 0.114713
pop2_1km_updated coefficient: 0.106175
Visit 7, 1998 coefficient: 0.089696
popT_2km_updated coefficient: 0.085384
popT_1km_updated coefficient: 0.081148
pop1_2km_updated coefficient: 0.079572
pup_pop_wgrp2 coefficient: 0.079104
Visit 4, 1998 coefficient: 0.063099
pup_pop coefficient: -0.061332
pup_pop_wgrp1 coefficient: -0.059835
Visit 5, 1998 coefficient: 0.042724
popT_3km_updated coefficient: 0.042416
pop1_1km_updated coefficient: -0.026889
yrbirth coefficient: 0.014936
sex coefficient: 0.003194
In [903]:
y = X['Visit 5, 1999']

X = X.drop(['Visit 5, 1999'], axis=1)

b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'

sortedB = np.argsort(-np.abs(b))
for i in sortedB:
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.0976138007945
avg sum squared error with median: 0.138277919864


 coefficients:
offset coefficient: -19.510351
Visit 4, 1999 coefficient: 0.262555
Visit 1, 1999 coefficient: 0.216105
yrbirth coefficient: 0.189165
Visit 7, 1998 coefficient: 0.170643
pop2_3km_updated coefficient: 0.087986
pup_pop_wgrp1 coefficient: -0.064266
pop2_2km_updated coefficient: -0.056889
pup_pop_wgrp2 coefficient: -0.056782
pup_pop coefficient: 0.055877
popT_3km_updated coefficient: -0.053596
Visit 3, 1998 coefficient: -0.042862
pop1_2km_updated coefficient: 0.035606
popT_1km_updated coefficient: 0.031010
Visit 5, 1998 coefficient: 0.030802
Visit 4, 1998 coefficient: 0.027874
pop2_1km_updated coefficient: 0.020766
popT_2km_updated coefficient: 0.011504
pop1_1km_updated coefficient: 0.011005
pop1_3km_updated coefficient: -0.008261
sex coefficient: 0.003867
In [838]:
covariance = np.cov(X.T)
eigenvalues, M = np.linalg.eig(covariance)
M = M.T

Almost all eigenvalues are non-zero, low correlation

In [839]:
print eigenvalues
[  3.63001145e-01   2.48355840e-01   1.99177150e-01   1.68684671e-01
   1.42509823e-01   1.17823959e-01   9.19776106e-02   8.84578433e-02
   7.43871315e-02   4.40113863e-02   3.85067477e-02   2.93271396e-02
   2.78771958e-02   2.04929895e-02   2.26386160e-03   1.43866218e-02
   7.12759630e-03   8.24487073e-03   9.77177139e-03   1.12639435e-16
   0.00000000e+00]

Maybe variations in date matter. regress on date difference?

In [845]:
AttendancePupidVisitIndex = Attendance.set_index('visit', append=True)
pivotAttendanceDateByPupid = pd.pivot_table(Attendance,index='pupid', values=['date'], columns=['visit'])

#pivotAttendanceDateByPupid.rename(columns={cat:cat+'_date' for cat in visitCategories}, inplace=True)
In [846]:
unionCols = set(pivotAttendanceByPupid.columns.values).intersection(set(svdData3.columns.values))
pivotAttendanceDateByPupid2 = pivotAttendanceDateByPupid[[('date',cat) for cat in unionCols]]

svdData4 = svdData3.join(pivotAttendanceDateByPupid2)

svdData4 = dropNullCat(svdData4, all=True)
In [904]:
y = svdData4['Visit 7, 1999']

X = svdData4.drop(['schid'], axis=1)

X = X.drop(['Visit 7, 1999'], axis=1)

for col in X.columns.values:
    if type(col)==tuple and col[1]!='Visit 7, 1999':
        val = X[('date','Visit 7, 1999')]-X[col]
        val = val**3
        X[col[1]+'_mult'] = val*X[col[1]]
        

X['pup_pop_wgrp1'] = X['wgrp1']*X['pup_pop']
X['pup_pop_wgrp2'] = X['wgrp2']*X['pup_pop']

X = X.drop(['wgrp1','wgrp2'], axis=1)

X = X/(X.max() - X.min())

X['offset'] = 1.

b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'

sortedB = np.argsort(-np.abs(b))
for i in sortedB:
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.116119820004
avg sum squared error with median: 0.198094553299


 coefficients:
offset coefficient: 76.923166
Visit 5, 1998_mult coefficient: -1.958619
Visit 7, 1998_mult coefficient: -1.760807
Visit 5, 1998 coefficient: 1.745389
Visit 7, 1998 coefficient: 1.711972
Visit 4, 1998_mult coefficient: 0.977430
Visit 4, 1998 coefficient: -0.904266
('date', 'Visit 5, 1998') coefficient: -0.655341
Visit 3, 1998_mult coefficient: -0.610665
('date', 'Visit 6, 1999') coefficient: 0.557112
Visit 3, 1998 coefficient: 0.472582
popT_3km_updated coefficient: 0.466890
('date', 'Visit 4, 1998') coefficient: 0.432939
('date', 'Visit 7, 1998') coefficient: -0.396747
('date', 'Visit 3, 1998') coefficient: -0.349780
pop2_3km_updated coefficient: -0.271923
Visit 5, 1999 coefficient: 0.254670
('date', 'Visit 7, 1999') coefficient: 0.254422
Visit 6, 1999_mult coefficient: 0.240224
Visit 1, 1999 coefficient: 0.221783
pop1_3km_updated coefficient: -0.201223
popT_2km_updated coefficient: -0.198447
pop2_2km_updated coefficient: 0.172825
yrbirth coefficient: -0.158251
Visit 6, 1999 coefficient: 0.150242
Visit 4, 1999 coefficient: 0.136243
Visit 1, 1999_mult coefficient: -0.131469
('date', 'Visit 5, 1999') coefficient: -0.120279
pup_pop coefficient: 0.099519
pop1_1km_updated coefficient: 0.096385
('date', 'Visit 4, 1999') coefficient: -0.080240
pup_pop_wgrp1 coefficient: -0.066700
popT_1km_updated coefficient: 0.066012
Visit 4, 1999_mult coefficient: -0.065892
('date', 'Visit 1, 1999') coefficient: 0.027954
pop2_1km_updated coefficient: -0.023700
pop1_2km_updated coefficient: -0.015318
pup_pop_wgrp2 coefficient: 0.011577
sex coefficient: 0.010201
Visit 5, 1999_mult coefficient: 0.006263

How about using Radial Basis Functions on date difference?

In [951]:
y = svdData4['Visit 7, 1999']

X = svdData4.drop(['schid'], axis=1)

X = X.drop(['Visit 7, 1999'], axis=1)

for col in X.columns.values:
    if type(col)==tuple and col[1]!='Visit 7, 1999':
        val = (X[('date','Visit 7, 1999')]-X[col])**3
        val = val**3
        X[col[1]+'_mult'] = val*X[col[1]]
        

X['pup_pop_wgrp1'] = X['wgrp1']*X['pup_pop']
X['pup_pop_wgrp2'] = X['wgrp2']*X['pup_pop']

X = X.drop(['wgrp1','wgrp2'], axis=1)

X = X/(X.max() - X.min())

X['offset'] = 1.

b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-np.round(y))**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)

print '\n\ncoefficients:'
sortedB = np.argsort(-np.abs(b))
for i in sortedB:
    print '%s coefficient: %f' % (X.columns.values[i], b[i])
avg sum squared error with SVD: 0.115678369394
avg sum squared error with median: 0.198094553299


coefficients:
offset coefficient: 22.361419
Visit 7, 1998_mult coefficient: -0.789098
Visit 5, 1998_mult coefficient: -0.784366
('date', 'Visit 7, 1999') coefficient: 0.716054
Visit 7, 1998 coefficient: 0.700368
('date', 'Visit 5, 1998') coefficient: -0.682095
Visit 5, 1998 coefficient: 0.576131
('date', 'Visit 6, 1999') coefficient: 0.557265
popT_3km_updated coefficient: 0.523652
('date', 'Visit 3, 1998') coefficient: -0.470896
Visit 3, 1998_mult coefficient: -0.460726
('date', 'Visit 7, 1998') coefficient: -0.404381
('date', 'Visit 4, 1998') coefficient: 0.377996
pop2_3km_updated coefficient: -0.308346
Visit 4, 1998_mult coefficient: 0.290614
pop1_3km_updated coefficient: -0.267703
Visit 4, 1998 coefficient: -0.253049
Visit 5, 1999 coefficient: 0.251657
Visit 1, 1999_mult coefficient: -0.239052
Visit 1, 1999 coefficient: 0.227458
popT_2km_updated coefficient: -0.194315
Visit 3, 1998 coefficient: 0.188964
Visit 6, 1999 coefficient: 0.176237
yrbirth coefficient: -0.161379
pop2_2km_updated coefficient: 0.155522
Visit 4, 1999_mult coefficient: -0.126097
Visit 4, 1999 coefficient: 0.122657
('date', 'Visit 4, 1999') coefficient: -0.118419
('date', 'Visit 5, 1999') coefficient: -0.116550
pup_pop coefficient: 0.093169
pop1_1km_updated coefficient: 0.092622
('date', 'Visit 1, 1999') coefficient: -0.072087
pup_pop_wgrp1 coefficient: -0.063076
popT_1km_updated coefficient: 0.059719
Visit 6, 1999_mult coefficient: -0.030950
Visit 5, 1999_mult coefficient: 0.030108
pop2_1km_updated coefficient: -0.026490
pup_pop_wgrp2 coefficient: 0.021982
pop1_2km_updated coefficient: -0.012361
sex coefficient: 0.009721

Maybe there is a hidden state we are not observing. Let's attempt using HMMs.

In [849]:
pTrans = np.array([[0.9,0.1],[0.1,0.9]])
pObs = np.array([[0.8,0.4],[0.2,0.6]])

def viterbi(y, pT=pTrans, pO=pObs, pInit=None, logProb=True):
    pLogState = np.zeros((len(y), len(pT)))
    backtrack = np.zeros((len(y), len(pT)), dtype=int)
    if pInit!=None:
        pLogState[0] = pInit
    else:
        pLogState[0] = -float('inf')
        pLogState[0,0] = 0
    
    logTrans = np.log(pTrans)
    logObs = np.log(pO)
    logZeroObs = np.zeros((1,len(pT)))
    
    for i in range(1,len(y)):
        if y[i]>-1:
            pLogY = logObs[y[i]:y[i]+1]
        else:
            pLogY = logZeroObs
            
        process = logTrans + pLogState[i-1:i].T + pLogY
        backtrack[i] = np.argmax(process,axis=0)
        if logProb:
            pLogState[i] = np.sum(process, axis=0)
        else:
            pLogState[i] = np.max(process, axis=0)
    
    guess = np.zeros(len(y), dtype=int)
    state = np.argmax(pLogState[-1])
    
    for i in range(1, len(y)+1):
        guess[-i] = state
        state = backtrack[-i, state]
    
    return guess, np.max(pLogState[-1])
In [850]:
pTransCoin = [0.1, 0.1]

pHeadCoin  = [0.2, 0.6]

def observeCoin(nums, pT=pTransCoin, pH=pHeadCoin):
    state = 0
    states = []
    observations = np.zeros(nums, dtype=int)
    
    for i in range(nums):
        states.append(state)
        if pH[state] > np.random.rand():
            observations[i]=1
        else:
            observations[i]=0
            
        if pT[state] > np.random.rand():
            state = 1-state
    
    return states, observations
In [853]:
sCoin, obsCoin = observeCoin(1000)
0

guess, maxVal = viterbi(obsCoin)

#print np.mean(guess==sCoin)

"""
Y = ObsContainPeriods(AttendanceByPupid, [2,3,4,6,8,11,12,13,14], dropOther=False)[visitCategories]

Y = np.array(Y, dtype=np.int64)
Y[Y<-1] = 2

pI = pInit=np.log(0.5*np.ones((1,2)))

def hmmError(pT, pO):
    return sum([viterbi(y, pT=pT, pO=pO, pInit=pI)[1] for y in Y])
    

rangeVals = np.arange(0.1,0.9,0.2)
    
def searchProbs():
    best = [0,0,0,0]
    minError = float('inf')

    i=0
    for pTrans1 in rangeVals:
        for pTrans2 in rangeVals:
            for pObs1 in rangeVals:
                for pObs2 in rangeVals:
                    pT = np.array([[pTrans1, 1-pTrans1],[1-pTrans2, pTrans2]])
                    pO = np.array([[1-pObs1, 1-pObs2],[pObs1, pObs2]])
                    error = hmmError(pT, pO)
                    if error < minError:
                        minError = error
                        best = [pTrans1, pTrans2, pObs1, pObs2]
                    elif abs(error-minError)<1e-9:
                        print "match"
                        print best
                        print [pTrans1, pTrans2, pObs1, pObs2]
                        print error
    return best, minError
"""
print 

In [854]:
from hmmlearn import hmm
In [930]:
remodel = hmm.MultinomialHMM(n_components=15)
In [931]:
Y[Y<0] = 2

hmmData = np.concatenate([Y[i:i+1].T for i in range(len(Y))])
length = [len(Y[i]) for i in range(len(Y))]
In [932]:
model = remodel.fit(hmmData, length)
In [933]:
hmmData2 = np.concatenate([Y[i:i+1,:-1].T for i in range(len(Y))])
length2 = [len(Y[i])-1 for i in range(len(Y))]
print hmmData2.shape
print len(length2)
(101955, 1)
6797
In [934]:
predict = remodel.predict(hmmData2, length2)
In [935]:
Y2 = np.array([predict[i*length2[0]+length2[0]-1] for i in range(len(length2))])
In [936]:
y = ObsContainPeriods(AttendanceByPupid, [2,3,4,6,8,11,12,13,14], dropOther=False)['Visit 7, 1999']
In [949]:
obs = remodel.transmat_[Y2].dot(remodel.emissionprob_)

guess2 = np.argmax(obs[:,:2], axis=1)

print "HMM mean squared error", np.mean(np.abs(y-guess2)**2)
print "Median guess mean squared error", np.mean(np.abs(y-1)**2)
HMM mean squared error 0.232896866265
Median guess mean squared error 0.232896866265

HMMs work as well as guessing the median past attendance

In [ ]:
 
In [ ]: