Coverage for larch/math/learn_regress.py: 12%

121 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-10-16 21:04 +0000

1#!/usr/bin/env python 

2""" 

3generalizerd linear models and feature selection using machine learning methods 

4including Partial Least Squares (PLS) and L1-Regularized Linear Modeling (Lasso). 

5 

6These methods are built on the methods from scikit-learn 

7""" 

8import numpy as np 

9 

10try: 

11 from sklearn.cross_decomposition import PLSRegression 

12 from sklearn.model_selection import RepeatedKFold 

13 from sklearn.preprocessing import StandardScaler 

14 from sklearn.linear_model import LassoLarsCV, LassoLars, Lasso 

15 HAS_SKLEARN = True 

16except ImportError: 

17 HAS_SKLEARN = False 

18 

19from .. import Group, isgroup 

20 

21from .utils import interp 

22from .lincombo_fitting import get_arrays, groups2matrix 

23 

24def pls_train(groups, varname='valence', arrayname='norm', scale=True, 

25 ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False, 

26 xmin=-np.inf, xmax=np.inf, **kws): 

27 

28 """use a list of data groups to train a Partial Least Squares model 

29 

30 Arguments 

31 --------- 

32 groups list of groups to use as components 

33 varname name of characteristic value to model ['valence'] 

34 arrayname string of array name to be fit (see Note 3) ['norm'] 

35 xmin x-value for start of fit range [-inf] 

36 xmax x-value for end of fit range [+inf] 

37 scale bool to scale data [True] 

38 cv_folds None or number of Cross-Validation folds (Seee Note 4) [None] 

39 cv_repeats None or number of Cross-Validation repeats (Seee Note 4) [None] 

40 skip_cv bool to skip doing Cross-Validation [None] 

41 ncomps number of independent components (See Note 5) [2] 

42 

43 Returns 

44 ------- 

45 group with trained PSLResgession, to be used with pls_predict 

46 

47 Notes 

48 ----- 

49 1. The group members for the components must match each other 

50 in data content and array names. 

51 2. all grouops must have an attribute (scalar value) for `varname` 

52 3. arrayname can be one of `norm` or `dmude` 

53 4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used 

54 (rounded to integer). if cv_repeats is None, sqrt(len(groups))-1 

55 will be used (rounded). 

56 5. The optimal number of components may be best found from PCA. If set to None, 

57 a search will be done for ncomps that gives the lowest RMSE_CV. 

58 """ 

59 if not HAS_SKLEARN: 

60 raise ImportError("scikit-learn not installed") 

61 

62 xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax) 

63 groupnames = [] 

64 ydat = [] 

65 for g in groups: 

66 groupnames.append(getattr(g, 'filename', 

67 getattr(g, 'groupname', repr(g)))) 

68 val = getattr(g, varname, None) 

69 if val is None: 

70 raise Value("group '%s' does not have attribute '%s'" % (g, varname)) 

71 ydat.append(val) 

72 ydat = np.array(ydat) 

73 

74 nvals = len(groups) 

75 

76 kws['scale'] = scale 

77 kws['n_components'] = ncomps 

78 

79 model = PLSRegression(**kws) 

80 

81 rmse_cv = None 

82 if not skip_cv: 

83 if cv_folds is None: 

84 cv_folds = int(round(np.sqrt(nvals))) 

85 if cv_repeats is None: 

86 cv_repeats = int(round(np.sqrt(nvals)) - 1) 

87 

88 resid = [] 

89 cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats) 

90 for ctrain, ctest in cv.split(range(nvals)): 

91 model.fit(spectra[ctrain, :], ydat[ctrain]) 

92 ypred = model.predict(spectra[ctest, :]) 

93 if len(ypred.shape) == 2: 

94 ypred = ypred[:, 0] 

95 

96 resid.extend((ypred - ydat[ctest]).tolist()) 

97 resid = np.array(resid) 

98 rmse_cv = np.sqrt( (resid**2).mean() ) 

99 

100 # final fit without cross-validation 

101 model = PLSRegression(**kws) 

102 out = model.fit(spectra, ydat) 

103 

104 ypred = model.predict(spectra) 

105 if len(ypred.shape) == 2: 

106 ypred = ypred[:, 0] 

107 

108 rmse = np.sqrt(((ydat - ypred)**2).mean()) 

109 

110 return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred, 

111 coefs=model.x_weights_, loadings=model.x_loadings_, 

112 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv, 

113 rmse=rmse, model=model, varname=varname, 

114 arrayname=arrayname, scale=scale, groupnames=groupnames, 

115 keywords=kws) 

116 

117 

118 

119def lasso_train(groups, varname='valence', arrayname='norm', alpha=None, 

120 use_lars=True, fit_intercept=True, normalize=True, 

121 cv_folds=None, cv_repeats=None, skip_cv=False, 

122 xmin=-np.inf, xmax=np.inf, **kws): 

123 

124 """use a list of data groups to train a Lasso/LassoLars model 

125 

126 Arguments 

127 --------- 

128 groups list of groups to use as components 

129 varname name of characteristic value to model ['valence'] 

130 arrayname string of array name to be fit (see Note 3) ['norm'] 

131 xmin x-value for start of fit range [-inf] 

132 xmax x-value for end of fit range [+inf] 

133 alpha alpha parameter for LassoLars (See Note 5) [None] 

134 use_lars bool to use LassoLars instead of Lasso [True] 

135 cv_folds None or number of Cross-Validation folds (Seee Note 4) [None] 

136 cv_repeats None or number of Cross-Validation repeats (Seee Note 4) [None] 

137 skip_cv bool to skip doing Cross-Validation [None] 

138 

139 Returns 

140 ------- 

141 group with trained LassoLars model, to be used with lasso_predict 

142 Notes 

143 ----- 

144 1. The group members for the components must match each other 

145 in data content and array names. 

146 2. all grouops must have an attribute (scalar value) for `varname` 

147 3. arrayname can be one of `norm` or `dmude` 

148 4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used 

149 (rounded to integer). if cv_repeats is None, sqrt(len(groups))-1 

150 will be used (rounded). 

151 5. alpha is the regularization parameter. if alpha is None it will 

152 be set using LassoLarsCV 

153 """ 

154 if not HAS_SKLEARN: 

155 raise ImportError("scikit-learn not installed") 

156 xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax) 

157 groupnames = [] 

158 ydat = [] 

159 for g in groups: 

160 groupnames.append(getattr(g, 'filename', 

161 getattr(g, 'groupname', repr(g)))) 

162 val = getattr(g, varname, None) 

163 if val is None: 

164 raise Value("group '%s' does not have attribute '%s'" % (g, varname)) 

165 ydat.append(val) 

166 ydat = np.array(ydat) 

167 

168 nvals = len(groups) 

169 ymean = norms = nonzero = None 

170 spectra_scaled = spectra[:] 

171 if fit_intercept and normalize: 

172 s_scalar = StandardScaler().fit(spectra) 

173 spectra_scaled = s_scalar.transform(spectra) 

174 

175 kws.update(dict(fit_intercept=fit_intercept)) 

176 creator = LassoLars if use_lars else Lasso 

177 model = None 

178 

179 npts = xdat.shape[0] 

180 rmse_cv = None 

181 if not skip_cv: 

182 if cv_folds is None: 

183 cv_folds = int(round(np.sqrt(nvals))) 

184 if cv_repeats is None: 

185 cv_repeats = int(round(np.sqrt(nvals)) - 1) 

186 

187 cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats) 

188 if alpha is None: 

189 lcvmod = LassoLarsCV(cv=cv, max_n_alphas=1e7, 

190 max_iter=1e7, eps=1.e-12, **kws) 

191 lcvmod.fit(spectra_scaled, ydat) 

192 alpha = lcvmod.alpha_ 

193 # if normalize: 

194 # alpha = alpha / np.sqrt(npts) 

195 model = creator(alpha=alpha, **kws) 

196 resid = [] 

197 for ctrain, ctest in cv.split(range(nvals)): 

198 model.fit(spectra_scaled[ctrain, :], ydat[ctrain]) 

199 ypred = model.predict(spectra_scaled[ctest, :]) 

200 resid.extend((ypred - ydat[ctest]).tolist()) 

201 resid = np.array(resid) 

202 rmse_cv = np.sqrt( (resid**2).mean() ) 

203 

204 if alpha is None: 

205 cvmod = creator(**kws) 

206 cvmod.fit(spectra_scaled, ydat) 

207 alpha = cvmod.alpha_ 

208 # alpha = alpha / np.sqrt(npts) 

209 

210 if model is None: 

211 model = creator(alpha=alpha, **kws) 

212 

213 # final fit without cross-validation 

214 out = model.fit(spectra_scaled, ydat) 

215 # print("model fit out: ", out) 

216 ypred = model.predict(spectra_scaled) 

217 

218 rmse = np.sqrt(((ydat - ypred)**2).mean()) 

219 

220 return Group(x=xdat, spectra=spectra, spectra_scaled=spectra_scaled, 

221 ydat=ydat, ypred=ypred, 

222 alpha=alpha, active=model.active_, coefs=model.coef_, 

223 cv_folds=cv_folds, cv_repeats=cv_repeats, 

224 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname, 

225 arrayname=arrayname, fit_intercept=fit_intercept, 

226 normalize=normalize, groupnames=groupnames, keywords=kws) 

227 

228 

229def _predict(group, model): 

230 """internal use """ 

231 # generate arrays and interpolate components onto the unknown x array 

232 xdat, ydat = get_arrays(group, model.arrayname) 

233 if xdat is None or ydat is None: 

234 raise ValueError("cannot get arrays for arrayname='%s'" % arrayname) 

235 

236 spectra = interp(xdat, ydat, model.x, kind='cubic') 

237 spectra.shape = (1, len(spectra)) 

238 return model.model.predict(spectra)[0] 

239 

240def lasso_predict(group, model): 

241 """ 

242 Predict the external value for a group based on a Lasso model 

243 

244 Arguments 

245 --------- 

246 group group with data to fit 

247 model Lasso/LassoLars model as found from lasso_train() 

248 

249 Returns 

250 ------- 

251 predict value of external variable for the group 

252 """ 

253 valid = (isgroup(model) and hasattr(model, 'model') and 

254 hasattr(model, 'x') and hasattr(model, 'arrayname') and 

255 model.model.__repr__().startswith('Lasso')) 

256 if not valid: 

257 raise ValueError("lasso_predict needs a Lasso training model") 

258 return _predict(group, model) 

259 

260def pls_predict(group, model): 

261 """ 

262 Predict the external value for a group based on a PLS model 

263 

264 Arguments 

265 --------- 

266 group group with data to fit 

267 model PLS model as found from pls_train() 

268 

269 Returns 

270 ------- 

271 predict value of external variable for the group 

272 """ 

273 valid = (isgroup(model) and hasattr(model, 'model') and 

274 hasattr(model, 'x') and hasattr(model, 'arrayname') and 

275 model.model.__repr__().startswith('PLS')) 

276 if not valid: 

277 raise ValueError("pls_predict needs a PLS training model") 

278 return _predict(group, model)