Coverage for larch/io/columnfile.py: 51%

348 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-10-16 21:04 +0000

1#!/usr/bin/env python 

2""" 

3 Larch column file reader: read_ascii 

4""" 

5import os 

6import sys 

7import time 

8import string 

9from pathlib import Path 

10from collections import namedtuple 

11import numpy as np 

12from dateutil.parser import parse as dateparse 

13from math import log10 

14from larch import Group 

15from larch.symboltable import isgroup 

16from ..utils import read_textfile, format_exception, gformat, fix_varname 

17from .xafs_beamlines import guess_beamline 

18 

19nanresult = namedtuple('NanResult', ('file_ok', 'message', 'nan_rows', 

20 'nan_cols', 'inf_rows', 'inf_cols')) 

21 

22TINY = 1.e-7 

23MAX_FILESIZE = 100*1024*1024 # 100 Mb limit 

24COMMENTCHARS = '#;%*!$' 

25 

def look_for_nans(path):
    """
    look for NaNs and Infs in an ASCII data file

    Arguments:
      path (string): full path to ASCII column file

    Returns:
      NanResult, named tuple with elements

      'file_ok' : bool, whether data is read and contains no NaNs or Infs
      'message' : exception message if file cannot be read at all or
                 'has nans', 'has infs' or 'has nans and infs'
      `nan_rows`: list of rows containing NaNs
      `nan_cols`: list of columns containing NaNs
      `inf_rows`: list of rows containing Infs
      `inf_cols`: list of columns containing Infs
    """
    nan_rows, nan_cols, inf_rows, inf_cols = [], [], [], []
    try:
        dat = read_ascii(path)
    except Exception:   # report any read failure, but do not crash
        print(''.join(format_exception()))
        return nanresult(False, f'could not read file {path}',
                         nan_rows, nan_cols, inf_rows, inf_cols)
    if len(dat.data) < 1:
        return nanresult(False, f'no data in file {path}',
                         nan_rows, nan_cols, inf_rows, inf_cols)
    if np.all(np.isfinite(dat.data)):
        return nanresult(True, 'file ok',
                         nan_rows, nan_cols, inf_rows, inf_cols)

    # dat.data has shape (ncolumns, nrows), so axis 0 of np.where output
    # indexes columns and axis 1 indexes rows.
    # fixed: msg was initialized to 'unknown', so the later
    # `len(msg) == 0` test could never be true and a file with only Infs
    # was mislabeled 'has nans and infs'.
    msg = ''
    nanvals = np.where(np.isnan(dat.data))
    if len(nanvals[0]) > 0:
        msg = 'has nans'
        for icol in nanvals[0]:
            if icol not in nan_cols:
                nan_cols.append(icol)
        for irow in nanvals[1]:
            if irow not in nan_rows:
                nan_rows.append(irow)

    infvals = np.where(np.isinf(dat.data))
    if len(infvals[0]) > 0:
        if len(msg) == 0:
            msg = 'has infs'
        else:
            msg = 'has nans and infs'
        for icol in infvals[0]:
            if icol not in inf_cols:
                inf_cols.append(icol)
        for irow in infvals[1]:
            if irow not in inf_rows:
                inf_rows.append(irow)

    return nanresult(False, msg, nan_rows, nan_cols, inf_rows, inf_cols)

84 

85 

def getfloats(txt, allow_times=True):
    """convert a line of numbers into a list of floats,
    as for reading a file with columnar numerical data.

    Arguments
    ---------
      txt (str) : line of text to parse
      allow_times (bool): whether to support time stamps [True]

    Returns
    -------
      list with each entry either a float or None

    Notes
    -----
      The `allow_times` will try to support common date-time strings
      using the dateutil module, returning a numerical value as the
      Unix timestamp, using
          time.mktime(dateutil.parser.parse(word).timetuple())
    """
    # commas are treated as whitespace so '1.0, 2.0' parses as two words
    words = [w.strip() for w in txt.replace(',', ' ').split()]
    mktime = time.mktime
    for i, w in enumerate(words):
        val = None
        try:
            val = float(w)
        except ValueError:
            # fixed: the allow_times flag was documented but ignored;
            # only attempt date-time parsing when it is enabled
            if allow_times:
                try:
                    val = mktime(dateparse(w).timetuple())
                except ValueError:
                    pass
        words[i] = val
    return words

119 

def colname(txt):
    """Return *txt* as a safe column/attribute name: stripped and
    lower-cased, passed through fix_varname, with '.' mapped to '_'."""
    cleaned = txt.strip().lower()
    return fix_varname(cleaned).replace('.', '_')

122 

123 

def parse_labelline(labelline, header):
    """
    parse the 'label line' for an ASCII file.

    This is meant to handle some special cases of XAFS data collected
    at a variety of sources.

    NOTE(review): currently an unimplemented placeholder -- it always
    returns None and neither `labelline` nor `header` is used.
    """
    pass

132 

133 

def sum_fluor_channels(dgroup, roi, icr=None, ocr=None, ltime=None, label=None,
                       add_data=True, **kws):
    """build summed, deadtime-corrected fluorescence spectrum for a Group

    Arguments
    ---------
    dgroup     data group
    roi        list of array indices for ROI
    icr        None or list of array indices for ICR [None]
    ocr        None or list of array indices for OCR [None]
    ltime      None or list of array indices for LTIME [None]
    label      None or label for the summed, corrected array [None]
    add_data   bool, whether to add label and data to dgroup [True]

    Returns
    -------
    (label, ndarray) with summed, deadtime-corrected data

    if add_data is True, the ndarray will also be appended to `dgroup.data`,
    and the label will be appended to `dgroup.array_labels`

    Notes
    ------
    1. The output array will be Sum[ roi*icr/(ocr*ltime) ]
    2. The default label will be like the array label for the 'dtcN' + first ROI
    3. icr, ocr, or ltime can be `None`, '1.0', '-1', or '1' to mean '1.0' or
       arrays of indices for the respective components: must be the same length as roi
    4. an array index of -1 will indicate 'bad channel' and be skipped for ROI
       or set to 1.0 for icr, ocr, or ltime
    5. if the list of arrays in roi, icr, ocr, or ltime are otherwise out-of-range,
       the returned (label, data) will be (None, None)
    """
    nchans = len(roi)
    # '1.0', 1, -1, or None all mean "use 1.0 for every channel"
    if icr in ('1.0', -1, 1, None):
        icr = [-1]*nchans
    if ocr in ('1.0', -1, 1, None):
        ocr = [-1]*nchans
    if ltime in ('1.0', -1, 1, None):
        ltime = [-1]*nchans
    if len(ltime) != nchans or len(icr) != nchans or len(ocr) != nchans:
        # fixed: was `raise Value(...)`, a NameError at runtime
        raise ValueError("arrays of indices for for roi, icr, ocr, and ltime must be the same length")

    narr, npts = dgroup.data.shape
    nused = 0
    total = 0.0     # renamed from `sum` to avoid shadowing the builtin
    olabel = None

    def get_data(arr, idx):
        """return (index, data) for one component: (-1, 1.0) means
        'unused', (None, None) means out-of-range, else (index, data row)"""
        iarr = arr[idx]
        if iarr < 0:
            return iarr, 1.0
        if iarr > narr-1:
            return None, None
        return iarr, dgroup.data[iarr, :]

    for pchan in range(nchans):
        iarr, droi = get_data(roi, pchan)
        if isinstance(droi, np.ndarray):
            # remember the label of the first real ROI array
            if olabel is None:
                olabel = dgroup.array_labels[iarr]
        elif iarr is None:
            return (None, None)
        else:  # index of -1 here means "skip this channel"
            continue

        iarr, dicr = get_data(icr, pchan)
        if iarr is None: return (None, None)

        # fixed: OCR was fetched twice in the original; once is enough
        iarr, docr = get_data(ocr, pchan)
        if iarr is None: return (None, None)

        iarr, dltime = get_data(ltime, pchan)
        if iarr is None: return (None, None)

        total += droi*dicr/(docr*dltime)
        nused += 1

    if label is None:
        if olabel is None: olabel = 'ROI'
        label = olabel = f'dtc{nused}_{olabel}'
        # avoid clashing with an existing array label
        n = 1
        while label in dgroup.array_labels:
            n += 1
            label = f'{olabel}_{n}'
    if add_data:
        dgroup.array_labels.append(label)
        dgroup.data = np.append(dgroup.data, total.reshape(1, len(total)), axis=0)
    return (label, total)

229 

230 

231 

def read_ascii(filename, labels=None, simple_labels=False,
               sort=False, sort_column=0):
    """read a column ascii column file, returning a group
    containing the data extracted from the file.

    Arguments:
      filename (str): name of file to read
      labels (list or None) : list of labels to use for array labels [None]
      simple_labels (bool) : whether to force simple column labels (note 1) [False]
      sort (bool) : whether to sort row data (note 2) [False]
      sort_column (int) : column to use for sorting (note 2) [0]

    Returns:
      Group

      A data group containing data read from file, with several attributes:

         | filename     : text name of the file.
         | array_labels : array labels, names of 1-D arrays.
         | data         : 2-dimensional data (ncolumns, nrows) with all data.
         | header       : array of text lines of the header.
         | footer       : array of text lines of the footer (text after the numerical data)
         | attrs        : group of attributes parsed from header lines.

    Notes:
      1. array labels.  If `labels` is `None` (the default), column labels
      and names of 1d arrays will be guessed from the file header.  This often
      means parsing the final header line, but tagged column files from several XAFS
      beamlines will be tried and used if matching.  Column labels may be like 'col1',
      'col2', etc if suitable column labels cannot be guessed.
      These labels will be used as names for the 1-d arrays from each column.
      If `simple_labels` is `True`, the names 'col1', 'col2' etc will be used
      regardless of the column labels found in the file.

      2. sorting.  Data can be sorted to be in increasing order of any column,
      by giving the column index (starting from 0).

      3. header parsing. If header lines are of the forms of

        | KEY : VAL
        | KEY = VAL

      these will be parsed into a 'attrs' dictionary in the returned group.

    Examples:

        >>> feo_data = read_ascii('feo_rt1.dat')
        >>> show(feo_data)
        == Group ascii_file feo_rt1.dat: 0 methods, 8 attributes ==
        array_labels: ['energy', 'xmu', 'i0']
        attrs: <Group header attributes from feo_rt1.dat>
        data: array<shape=(3, 412), type=dtype('float64')>
        energy: array<shape=(412,), type=dtype('float64')>
        filename: 'feo_rt1.dat'
        header: ['# room temperature FeO', '# data from 20-BM, 2001, as part of NXS school', ... ]
        i0: array<shape=(412,), type=dtype('float64')>
        xmu: array<shape=(412,), type=dtype('float64')>

    See Also:
        read_xdi, write_ascii

    """
    if not Path(filename).is_file():
        raise OSError("File not found: '%s'" % filename)
    if os.stat(filename).st_size > MAX_FILESIZE:
        raise OSError("File '%s' too big for read_ascii()" % filename)

    text = read_textfile(filename)
    lines = text.split('\n')

    ncol = None
    data, footers, headers = [], [], []

    # scan the file bottom-to-top: trailing non-numeric lines are FOOTER,
    # then numeric lines are DATA, and everything above them is HEADER.
    lines.reverse()
    section = 'FOOTER'

    for line in lines:
        line = line.strip()
        if len(line) < 1:
            continue
        # look for section transitions (going from bottom to top):
        # getfloats() returns None for any word that is not numeric,
        # so an all-float line marks the start (bottom) of the data block
        if section == 'FOOTER' and not None in getfloats(line):
            section = 'DATA'
        elif section == 'DATA' and None in getfloats(line):
            section = 'HEADER'

        # act of current section:
        if section == 'FOOTER':
            footers.append(line)
        elif section == 'HEADER':
            headers.append(line)
        elif section == 'DATA':
            rowdat = getfloats(line)
            # keep the table rectangular: pad short rows (or all earlier
            # rows, if this row is longer) with NaN
            if ncol is None:
                ncol = len(rowdat)
            elif ncol > len(rowdat):
                rowdat.extend([np.nan]*(ncol-len(rowdat)))
            elif ncol < len(rowdat):
                for i in data:
                    i.extend([np.nan]*(len(rowdat)-ncol))
                ncol = len(rowdat)
            data.append(rowdat)

    # reverse header, footer, data, convert to arrays
    footers.reverse()
    headers.reverse()
    data.reverse()
    # transpose to shape (ncolumns, nrows)
    data = np.array(data).transpose()

    # try to parse attributes from header text: 'KEY : VAL' or 'KEY = VAL'
    header_attrs = {}
    for hline in headers:
        hline = hline.strip().replace('\t', ' ')
        if len(hline) < 1: continue
        if hline[0] in COMMENTCHARS:
            hline = hline[1:].strip()
        keywds = []
        if ':' in hline: # keywords in 'x: 22'
            words = hline.split(':', 1)
            keywds = words[0].split()
        elif '=' in hline: # keywords in 'x = 22'
            words = hline.split('=', 1)
            keywds = words[0].split()
        # only single-word keys are accepted as attributes
        if len(keywds) == 1:
            key = colname(keywds[0])
            if key.startswith('_'):
                key = key[1:]
            if len(words) > 1:
                header_attrs[key] = words[1].strip()

    fpath = Path(filename).absolute()
    filename = fpath.as_posix()
    attrs = {'filename': filename}
    group = Group(name='ascii_file %s' % filename,
                  path=filename, filename=fpath.name,
                  header=headers, data=[], array_labels=[])

    if len(data) == 0:
        return group

    if sort and sort_column >= 0 and sort_column < ncol:
        data = data[:, np.argsort(data[sort_column])]

    group.data = data

    if len(footers) > 0:
        group.footer = footers

    group.attrs = Group(name='header attributes from %s' % filename)
    for key, val in header_attrs.items():
        setattr(group.attrs, key, val)

    # a string `labels` argument is split into a list, dropping separators
    if isinstance(labels, str):
        for bchar in ',#@%|:*':
            labels = labels.replace(bchar, '')
        labels = labels.split()
    if labels is None and not simple_labels:
        # try beamline-specific header formats to guess column labels
        bldat = guess_beamline(headers)(headers)
        labels = bldat.get_array_labels()

        # propagate optional beamline metadata when it is non-default
        if getattr(bldat, 'energy_units', 'eV') != 'eV':
            group.energy_units = bldat.energy_units
        if getattr(bldat, 'energy_column', 1) != 1:
            group.energy_column = bldat.energy_column
        if getattr(bldat, 'mono_dspace', -1) > 0:
            group.mono_dspace = bldat.mono_dspace

    set_array_labels(group, labels=labels, simple_labels=simple_labels)
    return group

402 

def set_array_labels(group, labels=None, simple_labels=False,
                     save_oldarrays=False):
    """set array names for a group from its 2D `data` array.

    Arguments
    ----------
    labels (list of strings or None): array of labels to use [None]
    simple_labels (bool): flag to use ('col1', 'col2', ...) [False]
    save_oldarrays (bool): flag to save old array names [False]

    Returns
    -------
    group with newly named attributes of 1D array data, and
    an updated `array_labels` giving the mapping of `data`
    columns to attribute names.

    Notes
    ------
    1. if `simple_labels=True` it will overwrite any values in `labels`

    2. Array labels must be valid python names. If not enough labels
       are specified, or if name clashes arise, the array names may be
       modified, often by appending an underscore and letter or by using
       ('col1', 'col2', ...) etc.

    3. When `save_oldarrays` is `False` (the default), arrays named in the
       current `group.array_labels` will be erased.  Other arrays and
       attributes will not be changed.

    """
    write = sys.stdout.write
    if not hasattr(group, 'data'):
        write("cannot set array labels for group '%s': no `data`\n" % repr(group))
        return

    # clear old arrays, if desired
    oldlabels = getattr(group, 'array_labels', None)
    if oldlabels is not None and not save_oldarrays:
        for attr in oldlabels:
            if hasattr(group, attr):
                delattr(group, attr)

    ncols, nrow = group.data.shape

    ####
    # step 1: determine user-defined labels from input options
    # generating array `tlabels` for test labels
    #
    # generate simple column labels, used as backup
    clabels = ['col%d' % (i+1) for i in range(ncols)]

    if isinstance(labels, str):
        labels = labels.split()

    # fixed: copy the input list -- `tlabels = labels` aliased the
    # caller's list, so padding it below mutated the caller's argument
    tlabels = None if labels is None else list(labels)

    # if simple column names requested or none given, use simple column names
    if simple_labels or tlabels is None:
        tlabels = clabels

    ####
    # step 2: check input and correct problems
    # 2.a: check for not enough and too many labels
    if len(tlabels) < ncols:
        for i in range(len(tlabels), ncols):
            tlabels.append("col%i" % (i+1))
    elif len(tlabels) > ncols:
        tlabels = tlabels[:ncols]

    # 2.b: check for names that clash with group attributes
    # or that are repeated, append letter.
    reserved_names = ('data', 'array_labels', 'filename',
                      'attrs', 'header', 'footer')
    extras = string.ascii_lowercase
    labels = []
    for i in range(ncols):
        lname = tlabels[i]
        if lname in reserved_names or lname in labels:
            lname = lname + '_a'
            j = 0
            while lname in labels:
                j += 1
                if j == len(extras):
                    break
                lname = "%s_%s" % (tlabels[i], extras[j])
            # all suffixes exhausted: fall back to the simple column name
            if lname in labels:
                lname = clabels[i]
        labels.append(lname)

    ####
    # step 3: assign attribute names, set 'array_labels'
    for i, name in enumerate(labels):
        setattr(group, name, group.data[i])
    group.array_labels = labels
    return group

500 

501 

def write_ascii(filename, *args, commentchar='#', label=None, header=None):
    """
    write a list of items to an ASCII column file

    Arguments:
      filename (str): name of output file
      args (list of groups): list of arrays (and header items) to write
      commentchar (str) : character for comment ('#')
      label (str or None): array label line (autogenerated)
      header (list of strings): array of strings for header

    Returns:
      None

    Examples:
       >>> write_ascii('myfile', group.energy, group.norm, header=['comment1', 'comment2'])

    """
    ARRAY_MINLEN = 2
    com = commentchar
    # remember whether the caller supplied a header: the original tested
    # `header is None` only *after* replacing None with [], so the default
    # 'Output from Larch' line was dead code and never written
    default_header = header is None
    if default_header:
        header = []

    arrays = []
    arraylen = None

    # sort arguments: long-enough ndarrays become data columns, anything
    # else is repr()ed into the header text
    for arg in args:
        if isinstance(arg, np.ndarray) and len(arg) > ARRAY_MINLEN:
            # rows are truncated to the shortest array
            if arraylen is None:
                arraylen = len(arg)
            else:
                arraylen = min(arraylen, len(arg))
            arrays.append(arg)
        else:
            header.append(repr(arg))

    if arraylen is None:
        raise ValueError("write_ascii() need %i or more elements in arrays." % ARRAY_MINLEN)

    buff = []
    if default_header:
        buff = ['%s Output from Larch %s' % (com, time.ctime())]
    for s in header:
        buff.append('%s %s' % (com, s))
    buff.append('%s---------------------------------' % com)
    if label is None:
        label = (' '*13).join(['col%d' % (i+1) for i in range(len(arrays))])
    buff.append('# %s' % label)

    arrays = np.array(arrays)
    for i in range(arraylen):
        row = [" %s" % gformat(val[i], length=14) for val in arrays]
        buff.append(' '.join(row))
    buff.append('')

    with open(filename, 'w', encoding=sys.getdefaultencoding()) as fout:
        fout.write('\n'.join(buff))
    sys.stdout.write("wrote to file '%s'\n" % filename)

565 

566 

def write_group(filename, group, scalars=None, arrays=None,
                arrays_like=None, commentchar='#'):
    """(deprecated) write components of a group to an ASCII column file


    Warning:
       This is pretty minimal and may work poorly for large groups of complex data.
       Use `save_session` instead.

    """
    members = dir(group)
    arrays = [] if arrays is None else arrays
    scalars = [] if scalars is None else scalars

    # reference array length, from the named "template" array, if given
    npts = 0
    if arrays_like is not None and arrays_like in members:
        template = getattr(group, arrays_like)
        if isinstance(template, np.ndarray):
            npts = len(template)

    # pick up any other ndarray members of matching length
    for name in members:
        val = getattr(group, name)
        if isinstance(val, np.ndarray):
            if npts != 0 and npts == len(val) and name not in arrays:
                arrays.append(name)

    # scalar members become 'name = value' header lines
    header = ["%s = %s" % (s, getattr(group, s))
              for s in scalars if s in members]

    label = ' '.join(arrays)

    args = [getattr(group, name) for name in arrays if name in members]

    write_ascii(filename, *args, commentchar=commentchar,
                label=label, header=header)

611 

612 

def read_fdmnes(filename, **kwargs):
    """read [FDMNES](http://fdmnes.neel.cnrs.fr/) ascii files"""
    group = read_ascii(filename, **kwargs)
    group.header_dict = {'filetype': 'FDMNES', 'energy_units': 'eV'}

    # parse header lines like '  <values> = <name1>, <name2>, ...'
    for headline in group.header:
        if "E_edge" not in headline:
            continue
        if headline.startswith("#"):
            headline = headline[1:]
        pieces = headline.split(" = ")
        vals = [float(v) for v in pieces[0].split(" ") if v]
        names = pieces[1].split(", ")
        group.header_dict.update(dict(zip(names, vals)))

    group.name = 'FDMNES file (unknown)'
    # shift relative energies to absolute using the edge energy
    group.energy += group.header_dict["E_edge"]

    # fix array labels beginning with '_' (e.g. '_xmu' -> 'xmu'),
    # renaming the matching group attribute as well
    for ilab, lab in enumerate(group.array_labels):
        if lab.startswith("_"):
            newlab = lab[1:]
            group.array_labels[ilab] = newlab
            delattr(group, lab)
            setattr(group, newlab, group.data[ilab])
    return group

634 

def guess_filereader(path, return_text=False):
    """guess function name to use to read a data file based on the file header

    Arguments
    ---------
    path (str) : file path to be read
    return_text (bool): whether to also return the text read [False]

    Returns
    -------
    name of function (as a string) to use to read file
    if return_text: text of the read file
    """
    text = read_textfile(path)
    first_line = text.split('\n')[0].lower()

    # checked in order: a later match overrides an earlier one
    markers = (('epics scan', 'read_gsescan'),
               ('xdi', 'read_xdi'),
               ('epics stepscan file', 'read_gsexdi'),
               ('#s', 'read_specfile'),
               ('#f', 'read_specfile'),
               ('fdmnes', 'read_fdmnes'))

    reader = 'read_ascii'
    for marker, readfunc in markers:
        if marker in first_line:
            reader = readfunc

    if return_text:
        return reader, text
    return reader