Coverage for larch/math/learn_regress.py: 12% of 121 statements
#!/usr/bin/env python
"""
Generalized linear models and feature selection using machine-learning methods,
including Partial Least Squares (PLS) and L1-Regularized Linear Modeling (Lasso).

These methods are built on methods from scikit-learn.
"""
import numpy as np

try:
    from sklearn.cross_decomposition import PLSRegression
    from sklearn.model_selection import RepeatedKFold
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LassoLarsCV, LassoLars, Lasso
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False

from .. import Group, isgroup

from .utils import interp
from .lincombo_fitting import get_arrays, groups2matrix

def pls_train(groups, varname='valence', arrayname='norm', scale=True,
              ncomps=2, cv_folds=None, cv_repeats=None, skip_cv=False,
              xmin=-np.inf, xmax=np.inf, **kws):
    """use a list of data groups to train a Partial Least Squares model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      scale       bool to scale data [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]
      ncomps      number of independent components (see Note 5) [2]

    Returns
    -------
      group with trained PLSRegression model, to be used with pls_predict

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. All groups must have an attribute (scalar value) for `varname`.
     3. arrayname can be one of `norm` or `dmude`.
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
        (rounded to integer). If cv_repeats is None, sqrt(len(groups))-1
        will be used (rounded). For example, 25 groups gives 5 folds and 4 repeats.
     5. The optimal number of components may be best found from PCA. If set to None,
        a search will be done for the ncomps that gives the lowest RMSE_CV.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")

    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)

    kws['scale'] = scale
    kws['n_components'] = ncomps

    model = PLSRegression(**kws)

    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        resid = []
        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra[ctest, :])
            if len(ypred.shape) == 2:
                ypred = ypred[:, 0]
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    # final fit without cross-validation
    model = PLSRegression(**kws)
    out = model.fit(spectra, ydat)

    ypred = model.predict(spectra)
    if len(ypred.shape) == 2:
        ypred = ypred[:, 0]

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, ydat=ydat, ypred=ypred,
                 coefs=model.x_weights_, loadings=model.x_loadings_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats, rmse_cv=rmse_cv,
                 rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, scale=scale, groupnames=groupnames,
                 keywords=kws)
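
# A minimal usage sketch for pls_train (not part of the module): the file names,
# attribute values, and energy range below are hypothetical, and `read_ascii` /
# `pre_edge` stand in for whatever workflow builds normalized data groups.
#
#     groups = []
#     for fname, val in [('fe2o3.dat', 3.0), ('feo.dat', 2.0), ('fe3o4.dat', 2.67)]:
#         g = read_ascii(fname)
#         pre_edge(g)          # populates g.norm
#         g.valence = val      # scalar attribute named by `varname`
#         groups.append(g)
#
#     trained = pls_train(groups, varname='valence', arrayname='norm',
#                         ncomps=2, xmin=7105, xmax=7125)
#     print(trained.rmse, trained.rmse_cv)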

def lasso_train(groups, varname='valence', arrayname='norm', alpha=None,
                use_lars=True, fit_intercept=True, normalize=True,
                cv_folds=None, cv_repeats=None, skip_cv=False,
                xmin=-np.inf, xmax=np.inf, **kws):
    """use a list of data groups to train a Lasso/LassoLars model

    Arguments
    ---------
      groups      list of groups to use as components
      varname     name of characteristic value to model ['valence']
      arrayname   string of array name to be fit (see Note 3) ['norm']
      xmin        x-value for start of fit range [-inf]
      xmax        x-value for end of fit range [+inf]
      alpha       alpha parameter for LassoLars (see Note 5) [None]
      use_lars    bool to use LassoLars instead of Lasso [True]
      fit_intercept  bool to fit an intercept [True]
      normalize   bool to scale the spectra with StandardScaler [True]
      cv_folds    None or number of Cross-Validation folds (see Note 4) [None]
      cv_repeats  None or number of Cross-Validation repeats (see Note 4) [None]
      skip_cv     bool to skip doing Cross-Validation [False]

    Returns
    -------
      group with trained LassoLars model, to be used with lasso_predict

    Notes
    -----
     1. The group members for the components must match each other
        in data content and array names.
     2. All groups must have an attribute (scalar value) for `varname`.
     3. arrayname can be one of `norm` or `dmude`.
     4. Cross-Validation: if cv_folds is None, sqrt(len(groups)) will be used
        (rounded to integer). If cv_repeats is None, sqrt(len(groups))-1
        will be used (rounded).
     5. alpha is the regularization parameter. If alpha is None it will
        be set using LassoLarsCV.
    """
    if not HAS_SKLEARN:
        raise ImportError("scikit-learn not installed")
    xdat, spectra = groups2matrix(groups, arrayname, xmin=xmin, xmax=xmax)
    groupnames = []
    ydat = []
    for g in groups:
        groupnames.append(getattr(g, 'filename',
                                  getattr(g, 'groupname', repr(g))))
        val = getattr(g, varname, None)
        if val is None:
            raise ValueError("group '%s' does not have attribute '%s'" % (g, varname))
        ydat.append(val)
    ydat = np.array(ydat)

    nvals = len(groups)
    ymean = norms = nonzero = None
    spectra_scaled = spectra[:]
    if fit_intercept and normalize:
        s_scalar = StandardScaler().fit(spectra)
        spectra_scaled = s_scalar.transform(spectra)

    kws.update(dict(fit_intercept=fit_intercept))
    creator = LassoLars if use_lars else Lasso
    model = None

    npts = xdat.shape[0]
    rmse_cv = None
    if not skip_cv:
        if cv_folds is None:
            cv_folds = int(round(np.sqrt(nvals)))
        if cv_repeats is None:
            cv_repeats = int(round(np.sqrt(nvals)) - 1)

        cv = RepeatedKFold(n_splits=cv_folds, n_repeats=cv_repeats)
        if alpha is None:
            lcvmod = LassoLarsCV(cv=cv, max_n_alphas=int(1e7),
                                 max_iter=int(1e7), eps=1.e-12, **kws)
            lcvmod.fit(spectra_scaled, ydat)
            alpha = lcvmod.alpha_
            # if normalize:
            #     alpha = alpha / np.sqrt(npts)
        model = creator(alpha=alpha, **kws)
        resid = []
        for ctrain, ctest in cv.split(range(nvals)):
            model.fit(spectra_scaled[ctrain, :], ydat[ctrain])
            ypred = model.predict(spectra_scaled[ctest, :])
            resid.extend((ypred - ydat[ctest]).tolist())
        resid = np.array(resid)
        rmse_cv = np.sqrt((resid**2).mean())

    if alpha is None:
        cvmod = creator(**kws)
        cvmod.fit(spectra_scaled, ydat)
        alpha = cvmod.alpha_
        # alpha = alpha / np.sqrt(npts)

    if model is None:
        model = creator(alpha=alpha, **kws)

    # final fit without cross-validation
    out = model.fit(spectra_scaled, ydat)
    # print("model fit out: ", out)
    ypred = model.predict(spectra_scaled)

    rmse = np.sqrt(((ydat - ypred)**2).mean())

    return Group(x=xdat, spectra=spectra, spectra_scaled=spectra_scaled,
                 ydat=ydat, ypred=ypred,
                 alpha=alpha, active=model.active_, coefs=model.coef_,
                 cv_folds=cv_folds, cv_repeats=cv_repeats,
                 rmse_cv=rmse_cv, rmse=rmse, model=model, varname=varname,
                 arrayname=arrayname, fit_intercept=fit_intercept,
                 normalize=normalize, groupnames=groupnames, keywords=kws)
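
# A minimal usage sketch for lasso_train (not part of the module), assuming the
# same hypothetical `groups` list as in the pls_train sketch above, each group
# with a `norm` array and a scalar `valence`:
#
#     trained = lasso_train(groups, varname='valence', arrayname='norm',
#                           use_lars=True, xmin=7105, xmax=7125)
#     print(trained.alpha, trained.rmse_cv)   # alpha chosen by LassoLarsCV
#     print(trained.active)                   # indices of non-zero coefficients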

def _predict(group, model):
    """internal use"""
    # generate arrays and interpolate components onto the unknown x array
    xdat, ydat = get_arrays(group, model.arrayname)
    if xdat is None or ydat is None:
        raise ValueError("cannot get arrays for arrayname='%s'" % model.arrayname)

    spectra = interp(xdat, ydat, model.x, kind='cubic')
    spectra.shape = (1, len(spectra))
    return model.model.predict(spectra)[0]

def lasso_predict(group, model):
    """
    Predict the external value for a group based on a Lasso model

    Arguments
    ---------
      group   group with data to fit
      model   Lasso/LassoLars model as found from lasso_train()

    Returns
    -------
      predicted value of the external variable for the group
    """
    valid = (isgroup(model) and hasattr(model, 'model') and
             hasattr(model, 'x') and hasattr(model, 'arrayname') and
             model.model.__repr__().startswith('Lasso'))
    if not valid:
        raise ValueError("lasso_predict needs a Lasso training model")
    return _predict(group, model)
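
# A minimal prediction sketch (not part of the module): `unknown` is a hypothetical
# group with the same array (`norm` here) as the training data, and `trained` is
# the Group returned by lasso_train above.
#
#     val = lasso_predict(unknown, trained)
#     print("predicted valence: ", val)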

def pls_predict(group, model):
    """
    Predict the external value for a group based on a PLS model

    Arguments
    ---------
      group   group with data to fit
      model   PLS model as found from pls_train()

    Returns
    -------
      predicted value of the external variable for the group
    """
    valid = (isgroup(model) and hasattr(model, 'model') and
             hasattr(model, 'x') and hasattr(model, 'arrayname') and
             model.model.__repr__().startswith('PLS'))
    if not valid:
        raise ValueError("pls_predict needs a PLS training model")
    return _predict(group, model)
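
# The PLS counterpart follows the same pattern as lasso_predict above; `unknown`
# and `trained_pls` are hypothetical names for a data group and the Group returned
# by pls_train:
#
#     val = pls_predict(unknown, trained_pls)
#     print("predicted valence: ", val)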