Many children in developing countries are impacted by worm parrasites. These worms are usually picked up through kids' skin tissue. In children's bodies, these worms have many adverse effects including consuming children's nutrition.
Having worms can dramatically alter a child's life. Worms have been shown to lead to significantly decreased school attendance, in turn causing reduced employment and salaries in adulthood, among other effects.
I want to model to try and predict whether students will attend school or not. Starting with an SVD and then using HMMs. Students will have a hidden state and Observations are available for students in the form of school attendance. The hypothesis is that students in certain states or with certain levels of illness will attend school less and this can be used to infer their hidden state/illness. Once a model is learned the Viterbi algorithm can be used to determine the most likely states.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Attendance = pd.read_stata("data/namelist.dta")
#print(Attendance.iloc[0:3])
Attendance = Attendance.set_index('pupid')
SchoolVar = pd.read_stata("data/schoolvar.dta")
#print(SchoolVar.dtypes)
Comply = pd.read_stata("data/comply.dta")
Comply[['pupid']] = Comply[['pupid']].apply(np.int64)
Comply = Comply.set_index('pupid')
wormed = pd.read_stata("data/wormed.dta")
#print(pupq.iloc[0:2])
wormed = wormed.set_index('pupid')
#worms = pd.read_csv("ted_miguel_worms.csv")
#print(worms.iloc[:5])
visit = Attendance['visit']
visitCategories = visit.cat.categories
schoolIds = SchoolVar['schid']
visitNum = [cat for cat in visitCategories]
groups = range(1,4)
meanAttendanceDay = np.zeros((3, len(visitNum), 2))
for i in range(len(groups)):
for j in range(len(visitNum)):
val = Attendance[(Attendance["visit"]==visitNum[j]) & (Attendance["wgrp"]==groups[i]) & (Attendance["obs"]==1)]
meanAttendanceDay[i][j] = (val['prs'].mean(), val['date'].mean())
plt.figure()
plt.title("mean attendance by visit for different groups in the study")
for i in range(len(groups)):
plt.plot(meanAttendanceDay[i,:,1], meanAttendanceDay[i,:,0], label="group"+str(i+1))
plt.legend()
plt.xlabel("day")
#plt.show()
meanAttendanceSchoolDay = np.zeros((3, len(visitNum)*len(schoolIds), 3))
for i in range(len(groups)):
for j in range(len(visitNum)):
for k in range(len(schoolIds)):
val = Attendance[(Attendance["visit"]==visitNum[j]) & (Attendance['schid']==schoolIds[k]) & (Attendance["wgrp"]==groups[i]) & (Attendance["obs"]==1)]
meanAttendanceSchoolDay[i][j*len(schoolIds) + k] = (val['prs'].mean(), val['date'].mean(), schoolIds[k])
plt.figure()
plt.title("mean attendance by visit for different groups in the study, by school")
for i in range(len(groups)):
plt.scatter(meanAttendanceSchoolDay[i,:,1], meanAttendanceSchoolDay[i,:,0], label="group"+str(i+1), color=['r','g','b'][i])
plt.legend()
plt.xlabel("day")
plt.show()
plt.figure()
for i in range(len(groups)):
groupAttendance = Attendance[(Attendance["wgrp"]==groups[i]) & (Attendance["obs"]==1)]
#print groupAttendance['date']
plt.scatter(groupAttendance['date'][:10000], groupAttendance['prs'][:10000], color=['r','g','b'][i])
plt.xlabel("day")
plt.show()
print Comply.columns
print Attendance.columns
def dropNullCat(data, cats=[], all=False):
if all:
cats = data.columns.values
if type(cats) is str:
mask = data[cats].notnull()
else:
mask = data[cats[0]].notnull()
for i in range(1,len(cats)):
mask = mask&(data[cats[i]].notnull())
return data[mask]
def ObsContainPeriods(data, periods, dropOther=True):
cats=visitCategories[periods]
data2=dropNullCat(data, cats)
drops = []
if dropOther:
drops = [cat for cat in visitCategories if cat not in cats and cat in data2]
return data2.drop(drops, axis=1)
pivotAttendanceByPupid = pd.pivot_table(Attendance,index='pupid', values='prs', columns=['visit'])
#genAttendanceByPupid = Attendance.groupby('pupid').first().drop(['visit','obs','prs','wgrp','date','std','Tmonths'], axis=1)
genAttendanceByPupidWhole = Attendance.groupby('pupid').first()
genAttendanceByPupid = genAttendanceByPupidWhole[['wgrp1', 'wgrp2', 'sex', 'yrbirth']]
AttendanceByPupid = genAttendanceByPupid.join(pivotAttendanceByPupid)
def maxObs(data, num, i=0):
if num==0:
return (len(data), [])
elif i==len(visitCategories)-num:
return (0, [])
else:
drop = maxObs(dropNullCat(data,visitCategories[i]), num-1, i+1)
keep = maxObs(data, num, i+1)
if drop[0] > keep[0]:
return (drop[0], drop[1]+[i])
else:
return keep
print maxObs(AttendanceByPupid, 9)
AttendanceByPupidHighConsistentObs = ObsContainPeriods(AttendanceByPupid, [2,3,4,6,8,11,12,13,14])
print AttendanceByPupidHighConsistentObs.mean()-AttendanceByPupid[AttendanceByPupidHighConsistentObs.columns.values].mean()
svdData = dropNullCat(AttendanceByPupidHighConsistentObs,['sex','yrbirth'])
print 'svdDataLen:', len(svdData)
print sum(genAttendanceByPupidWhole['totobs']==8)
y = svdData['Visit 7, 1999']
X = svdData.drop('Visit 7, 1999', axis=1)
X['offset'] = 1.
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print 'n\n coefficients:'
for i in range(len(b)):
print '%s coefficient: %f' % (X.columns.values[i], b[i])
for i in range(1,8):
X2 = ObsContainPeriods(X, [2,3,4,6,8,11,12,13][i:])
X2.dtypes
b2 = np.linalg.pinv(X2).dot(y)
print '\n\nSVD with past %d observations' % (8-i)
print 'avg sum squared error with SVD:', sum(abs(X2.dot(b2)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
ComplyPupidIndex = Comply[['a981','a982','a991','a992']]
ComplyObsHigh = ComplyPupidIndex.loc[svdData.index]
print len(ComplyObsHigh.index)
print len(svdData.index)
svdData2 = svdData.join(ComplyObsHigh)
svdData2 = dropNullCat(svdData2, all=True)
print len(svdData2)
y = svdData2['Visit 7, 1999']
X = svdData2.drop('Visit 7, 1999', axis=1)
X['offset'] = 1.
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'
for i in range(len(b)):
print '%s coefficient: %f' % (X.columns.values[i], b[i])
covariance = np.cov(X.T)
eigenvalues, M = np.linalg.eig(covariance)
M = M.T
diagonalizedCovariance = M.dot(covariance.dot(M.T))
print eigenvalues
schools2 = SchoolVar.set_index('schid')[['pup_pop','pop1_1km_updated', 'pop1_2km_updated', 'pop1_3km_updated', 'pop2_1km_updated', 'pop2_2km_updated', 'pop2_3km_updated', 'popT_2km_updated', 'popT_3km_updated', 'popT_1km_updated']]
print len(svdData2)
svdData3 = svdData.join(genAttendanceByPupidWhole.loc[svdData2.index]['schid'], how='left')
print len(svdData3)
svdData3 = svdData3.join(schools2, on='schid', how='left')
svdData3 = dropNullCat(svdData3, all=True)
print 'len change:\n', len(svdData2)
print len(svdData3)
print len(genAttendanceByPupidWhole.loc[svdData2.index]['schid'])
"""
X = svdData3.loc[:,['pup_pop','pop1_1km_updated', 'pop1_2km_updated', 'pop1_3km_updated', 'pop2_1km_updated', 'pop2_2km_updated', 'pop2_3km_updated', 'popT_1km_updated', 'popT_2km_updated', 'popT_3km_updated']]
X['offset'] = 1.
y = svdData3['Visit 7, 1999']
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with mean:', sum(abs(y-np.mean(y))**2)/len(y)
print '\n\n coefficients:'
sortedB = np.argsort(np.abs(b))
for i in sortedB:
print '%s coefficient: %f' % (X.columns.values[i], b[i])
"""
print
y = svdData3['Visit 7, 1999']
X = svdData3.drop(['schid','Visit 7, 1999'], axis=1)
X['pup_pop_wgrp1'] = X['wgrp1']*X['pup_pop']
X['pup_pop_wgrp2'] = X['wgrp2']*X['pup_pop']
X = X.drop(['wgrp1','wgrp2'], axis=1)
X = X/(X.max() - X.min())
X['offset'] = 1.
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'
sortedB = np.argsort(-np.abs(b))
for i in sortedB:
print '%s coefficient: %f' % (X.columns.values[i], b[i])
y = X['Visit 6, 1999']
X = X.drop(['Visit 6, 1999'], axis=1)
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'
sortedB = np.argsort(-np.abs(b))
for i in sortedB:
print '%s coefficient: %f' % (X.columns.values[i], b[i])
y = X['Visit 5, 1999']
X = X.drop(['Visit 5, 1999'], axis=1)
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'
sortedB = np.argsort(-np.abs(b))
for i in sortedB:
print '%s coefficient: %f' % (X.columns.values[i], b[i])
covariance = np.cov(X.T)
eigenvalues, M = np.linalg.eig(covariance)
M = M.T
print eigenvalues
AttendancePupidVisitIndex = Attendance.set_index('visit', append=True)
pivotAttendanceDateByPupid = pd.pivot_table(Attendance,index='pupid', values=['date'], columns=['visit'])
#pivotAttendanceDateByPupid.rename(columns={cat:cat+'_date' for cat in visitCategories}, inplace=True)
unionCols = set(pivotAttendanceByPupid.columns.values).intersection(set(svdData3.columns.values))
pivotAttendanceDateByPupid2 = pivotAttendanceDateByPupid[[('date',cat) for cat in unionCols]]
svdData4 = svdData3.join(pivotAttendanceDateByPupid2)
svdData4 = dropNullCat(svdData4, all=True)
y = svdData4['Visit 7, 1999']
X = svdData4.drop(['schid'], axis=1)
X = X.drop(['Visit 7, 1999'], axis=1)
for col in X.columns.values:
if type(col)==tuple and col[1]!='Visit 7, 1999':
val = X[('date','Visit 7, 1999')]-X[col]
val = val**3
X[col[1]+'_mult'] = val*X[col[1]]
X['pup_pop_wgrp1'] = X['wgrp1']*X['pup_pop']
X['pup_pop_wgrp2'] = X['wgrp2']*X['pup_pop']
X = X.drop(['wgrp1','wgrp2'], axis=1)
X = X/(X.max() - X.min())
X['offset'] = 1.
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-y)**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\n coefficients:'
sortedB = np.argsort(-np.abs(b))
for i in sortedB:
print '%s coefficient: %f' % (X.columns.values[i], b[i])
y = svdData4['Visit 7, 1999']
X = svdData4.drop(['schid'], axis=1)
X = X.drop(['Visit 7, 1999'], axis=1)
for col in X.columns.values:
if type(col)==tuple and col[1]!='Visit 7, 1999':
val = (X[('date','Visit 7, 1999')]-X[col])**3
val = val**3
X[col[1]+'_mult'] = val*X[col[1]]
X['pup_pop_wgrp1'] = X['wgrp1']*X['pup_pop']
X['pup_pop_wgrp2'] = X['wgrp2']*X['pup_pop']
X = X.drop(['wgrp1','wgrp2'], axis=1)
X = X/(X.max() - X.min())
X['offset'] = 1.
b = np.linalg.pinv(X).dot(y)
print 'avg sum squared error with SVD:', sum(abs(X.dot(b)-np.round(y))**2)/len(y)
print 'avg sum squared error with median:', sum(abs(y-1)**2)/len(y)
print '\n\ncoefficients:'
sortedB = np.argsort(-np.abs(b))
for i in sortedB:
print '%s coefficient: %f' % (X.columns.values[i], b[i])
pTrans = np.array([[0.9,0.1],[0.1,0.9]])
pObs = np.array([[0.8,0.4],[0.2,0.6]])
def viterbi(y, pT=pTrans, pO=pObs, pInit=None, logProb=True):
pLogState = np.zeros((len(y), len(pT)))
backtrack = np.zeros((len(y), len(pT)), dtype=int)
if pInit!=None:
pLogState[0] = pInit
else:
pLogState[0] = -float('inf')
pLogState[0,0] = 0
logTrans = np.log(pTrans)
logObs = np.log(pO)
logZeroObs = np.zeros((1,len(pT)))
for i in range(1,len(y)):
if y[i]>-1:
pLogY = logObs[y[i]:y[i]+1]
else:
pLogY = logZeroObs
process = logTrans + pLogState[i-1:i].T + pLogY
backtrack[i] = np.argmax(process,axis=0)
if logProb:
pLogState[i] = np.sum(process, axis=0)
else:
pLogState[i] = np.max(process, axis=0)
guess = np.zeros(len(y), dtype=int)
state = np.argmax(pLogState[-1])
for i in range(1, len(y)+1):
guess[-i] = state
state = backtrack[-i, state]
return guess, np.max(pLogState[-1])
pTransCoin = [0.1, 0.1]
pHeadCoin = [0.2, 0.6]
def observeCoin(nums, pT=pTransCoin, pH=pHeadCoin):
state = 0
states = []
observations = np.zeros(nums, dtype=int)
for i in range(nums):
states.append(state)
if pH[state] > np.random.rand():
observations[i]=1
else:
observations[i]=0
if pT[state] > np.random.rand():
state = 1-state
return states, observations
sCoin, obsCoin = observeCoin(1000)
0
guess, maxVal = viterbi(obsCoin)
#print np.mean(guess==sCoin)
"""
Y = ObsContainPeriods(AttendanceByPupid, [2,3,4,6,8,11,12,13,14], dropOther=False)[visitCategories]
Y = np.array(Y, dtype=np.int64)
Y[Y<-1] = 2
pI = pInit=np.log(0.5*np.ones((1,2)))
def hmmError(pT, pO):
return sum([viterbi(y, pT=pT, pO=pO, pInit=pI)[1] for y in Y])
rangeVals = np.arange(0.1,0.9,0.2)
def searchProbs():
best = [0,0,0,0]
minError = float('inf')
i=0
for pTrans1 in rangeVals:
for pTrans2 in rangeVals:
for pObs1 in rangeVals:
for pObs2 in rangeVals:
pT = np.array([[pTrans1, 1-pTrans1],[1-pTrans2, pTrans2]])
pO = np.array([[1-pObs1, 1-pObs2],[pObs1, pObs2]])
error = hmmError(pT, pO)
if error < minError:
minError = error
best = [pTrans1, pTrans2, pObs1, pObs2]
elif abs(error-minError)<1e-9:
print "match"
print best
print [pTrans1, pTrans2, pObs1, pObs2]
print error
return best, minError
"""
print
from hmmlearn import hmm
remodel = hmm.MultinomialHMM(n_components=15)
Y[Y<0] = 2
hmmData = np.concatenate([Y[i:i+1].T for i in range(len(Y))])
length = [len(Y[i]) for i in range(len(Y))]
model = remodel.fit(hmmData, length)
hmmData2 = np.concatenate([Y[i:i+1,:-1].T for i in range(len(Y))])
length2 = [len(Y[i])-1 for i in range(len(Y))]
print hmmData2.shape
print len(length2)
predict = remodel.predict(hmmData2, length2)
Y2 = np.array([predict[i*length2[0]+length2[0]-1] for i in range(len(length2))])
y = ObsContainPeriods(AttendanceByPupid, [2,3,4,6,8,11,12,13,14], dropOther=False)['Visit 7, 1999']
obs = remodel.transmat_[Y2].dot(remodel.emissionprob_)
guess2 = np.argmax(obs[:,:2], axis=1)
print "HMM mean squared error", np.mean(np.abs(y-guess2)**2)
print "Median guess mean squared error", np.mean(np.abs(y-1)**2)