Coverage for larch/io/columnfile.py: 51%
348 statements
#!/usr/bin/env python
"""
Larch column file reader: read_ascii
"""
import os
import sys
import time
import string
from pathlib import Path
from collections import namedtuple
import numpy as np
from dateutil.parser import parse as dateparse
from math import log10
from larch import Group
from larch.symboltable import isgroup
from ..utils import read_textfile, format_exception, gformat, fix_varname
from .xafs_beamlines import guess_beamline

nanresult = namedtuple('NanResult', ('file_ok', 'message', 'nan_rows',
                                     'nan_cols', 'inf_rows', 'inf_cols'))

TINY = 1.e-7
MAX_FILESIZE = 100*1024*1024   # 100 Mb limit
COMMENTCHARS = '#;%*!$'


def look_for_nans(path):
    """
    look for NaNs and Infs in an ASCII data file

    Arguments:
      path (string): full path to ASCII column file

    Returns:
      NanResult, named tuple with elements

        'file_ok' : bool, whether data was read and contains no NaNs or Infs
        'message' : exception message if the file cannot be read at all, or
                    'has nans', 'has infs' or 'has nans and infs'
        'nan_rows': list of rows containing NaNs
        'nan_cols': list of columns containing NaNs
        'inf_rows': list of rows containing Infs
        'inf_cols': list of columns containing Infs
    """
    nan_rows, nan_cols, inf_rows, inf_cols = [], [], [], []
    try:
        dat = read_ascii(path)
    except Exception:
        print(''.join(format_exception()))
        return nanresult(False, f'could not read file {path}',
                         nan_rows, nan_cols, inf_rows, inf_cols)
    if len(dat.data) < 1:
        return nanresult(False, f'no data in file {path}',
                         nan_rows, nan_cols, inf_rows, inf_cols)
    if np.all(np.isfinite(dat.data)):
        return nanresult(True, 'file ok',
                         nan_rows, nan_cols, inf_rows, inf_cols)

    msg = ''
    nanvals = np.where(np.isnan(dat.data))
    if len(nanvals[0]) > 0:
        msg = 'has nans'
        for icol in nanvals[0]:
            if icol not in nan_cols:
                nan_cols.append(icol)
        for irow in nanvals[1]:
            if irow not in nan_rows:
                nan_rows.append(irow)

    infvals = np.where(np.isinf(dat.data))
    if len(infvals[0]) > 0:
        if len(msg) == 0:
            msg = 'has infs'
        else:
            msg = 'has nans and infs'
        for icol in infvals[0]:
            if icol not in inf_cols:
                inf_cols.append(icol)
        for irow in infvals[1]:
            if irow not in inf_rows:
                inf_rows.append(irow)

    return nanresult(False, msg, nan_rows, nan_cols, inf_rows, inf_cols)
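
# A minimal usage sketch for look_for_nans (the file name 'scan001.dat' is
# hypothetical):
#
#     res = look_for_nans('scan001.dat')
#     if not res.file_ok:
#         print(f"problem: {res.message}")
#         print("columns with NaNs:", res.nan_cols)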


def getfloats(txt, allow_times=True):
    """convert a line of numbers into a list of floats,
    as for reading a file with columnar numerical data.

    Arguments
    ---------
      txt (str) : line of text to parse
      allow_times (bool): whether to support timestamps [True]

    Returns
    -------
      list with each entry either a float or None

    Notes
    -----
      with `allow_times`, common date-time strings are parsed with the
      dateutil module and converted to a numerical Unix timestamp, using
      time.mktime(dateutil.parser.parse(word).timetuple())
    """
    words = [w.strip() for w in txt.replace(',', ' ').split()]
    mktime = time.mktime
    for i, w in enumerate(words):
        val = None
        try:
            val = float(w)
        except ValueError:
            if allow_times:
                try:
                    val = mktime(dateparse(w).timetuple())
                except ValueError:
                    pass
        words[i] = val
    return words
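
# For example (illustrative values; the third word is neither a float nor a
# parseable date, so it becomes None):
#
#     getfloats('8950.0, 1.07, hello')
#     # -> [8950.0, 1.07, None]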


def colname(txt):
    return fix_varname(txt.strip().lower()).replace('.', '_')


def parse_labelline(labelline, header):
    """
    parse the 'label line' of an ASCII file.

    This is meant to handle some special cases of XAFS data collected
    at a variety of sources.
    """
    pass


def sum_fluor_channels(dgroup, roi, icr=None, ocr=None, ltime=None, label=None,
                       add_data=True, **kws):
    """build summed, deadtime-corrected fluorescence spectrum for a Group

    Arguments
    ---------
      dgroup     data group
      roi        list of array indices for ROI
      icr        None or list of array indices for ICR [None]
      ocr        None or list of array indices for OCR [None]
      ltime      None or list of array indices for LTIME [None]
      label      None or label for the summed, corrected array [None]
      add_data   bool, whether to add label and data to dgroup [True]

    Returns
    -------
      label, ndarray with summed, deadtime-corrected data

      if add_data is True, the ndarray will also be appended to `dgroup.data`,
      and the label will be appended to `dgroup.array_labels`

    Notes
    ------
      1. The output array will be Sum[ roi*icr/(ocr*ltime) ]
      2. The default label will be like the array label of the first ROI,
         prefixed with 'dtcN_' for N summed channels
      3. icr, ocr, or ltime can each be `None`, '1.0', -1, or 1 to mean '1.0',
         or a list of array indices for the respective component: these must
         be the same length as roi
      4. an array index of -1 indicates a 'bad channel': it is skipped for ROI,
         or set to 1.0 for icr, ocr, or ltime
      5. if the array indices in roi, icr, ocr, or ltime are otherwise
         out of range, the returned (label, data) will be (None, None)
    """
    nchans = len(roi)
    if icr in ('1.0', -1, 1, None):
        icr = [-1]*nchans
    if ocr in ('1.0', -1, 1, None):
        ocr = [-1]*nchans
    if ltime in ('1.0', -1, 1, None):
        ltime = [-1]*nchans
    if len(ltime) != nchans or len(icr) != nchans or len(ocr) != nchans:
        raise ValueError("arrays of indices for roi, icr, ocr, and ltime must be the same length")

    narr, npts = dgroup.data.shape
    nused = 0
    dsum = 0.0
    olabel = None

    def get_data(arr, idx):
        iarr = arr[idx]
        if iarr < 0:        # 'bad channel': use scalar 1.0
            return iarr, 1.0
        if iarr > narr-1:   # out of range: signal failure
            return None, None
        return iarr, dgroup.data[iarr, :]

    for pchan in range(nchans):
        droi = dicr = docr = dltime = 1.0
        iarr, droi = get_data(roi, pchan)
        if isinstance(droi, np.ndarray):
            if olabel is None:
                olabel = dgroup.array_labels[iarr]
        elif iarr is None:
            return (None, None)
        else:   # index of -1 here means "skip this channel"
            continue

        iarr, dicr = get_data(icr, pchan)
        if iarr is None:
            return (None, None)

        iarr, docr = get_data(ocr, pchan)
        if iarr is None:
            return (None, None)

        iarr, dltime = get_data(ltime, pchan)
        if iarr is None:
            return (None, None)

        dsum += droi*dicr/(docr*dltime)
        nused += 1

    if label is None:
        if olabel is None:
            olabel = 'ROI'
        label = olabel = f'dtc{nused}_{olabel}'
        n = 1
        while label in dgroup.array_labels:
            n += 1
            label = f'{olabel}_{n}'
    if add_data:
        dgroup.array_labels.append(label)
        dgroup.data = np.append(dgroup.data, dsum.reshape(1, len(dsum)), axis=0)
    return (label, dsum)
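
# A hedged usage sketch: sum four fluorescence channels with deadtime
# correction (the index lists are hypothetical and depend on the column
# layout of the data file):
#
#     dat = read_ascii('scan001.dat')
#     label, corrected = sum_fluor_channels(dat, roi=[4, 5, 6, 7],
#                                           icr=[8, 9, 10, 11],
#                                           ocr=[12, 13, 14, 15],
#                                           ltime=None)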


def read_ascii(filename, labels=None, simple_labels=False,
               sort=False, sort_column=0):
    """read a columnar ASCII file, returning a group
    containing the data extracted from the file.

    Arguments:
      filename (str): name of file to read
      labels (list or None) : list of labels to use for array labels [None]
      simple_labels (bool) : whether to force simple column labels (note 1) [False]
      sort (bool) : whether to sort row data (note 2) [False]
      sort_column (int) : column to use for sorting (note 2) [0]

    Returns:
      Group

      A data group containing data read from file, with several attributes:

         | filename     : text name of the file.
         | array_labels : array labels, names of 1-D arrays.
         | data         : 2-dimensional data (ncolumns, nrows) with all data.
         | header       : array of text lines of the header.
         | footer       : array of text lines of the footer (text after the numerical data)
         | attrs        : group of attributes parsed from header lines.

    Notes:
      1. array labels.  If `labels` is `None` (the default), column labels
         and names of 1-d arrays will be guessed from the file header.  This often
         means parsing the final header line, but tagged column files from several XAFS
         beamlines will be tried and used if matching.  Column labels may be like 'col1',
         'col2', etc if suitable column labels cannot be guessed.
         These labels will be used as names for the 1-d arrays from each column.
         If `simple_labels` is `True`, the names 'col1', 'col2' etc will be used
         regardless of the column labels found in the file.

      2. sorting.  Data can be sorted to be in increasing order of any column,
         by giving the column index (starting from 0).

      3. header parsing.  If header lines are of the form

           | KEY : VAL
           | KEY = VAL

         these will be parsed into an 'attrs' dictionary in the returned group.

    Examples:

        >>> feo_data = read_ascii('feo_rt1.dat')
        >>> show(feo_data)
        == Group ascii_file feo_rt1.dat: 0 methods, 8 attributes ==
        array_labels: ['energy', 'xmu', 'i0']
        attrs: <Group header attributes from feo_rt1.dat>
        data: array<shape=(3, 412), type=dtype('float64')>
        energy: array<shape=(412,), type=dtype('float64')>
        filename: 'feo_rt1.dat'
        header: ['# room temperature FeO', '# data from 20-BM, 2001, as part of NXS school', ... ]
        i0: array<shape=(412,), type=dtype('float64')>
        xmu: array<shape=(412,), type=dtype('float64')>

    See Also:
        read_xdi, write_ascii
    """
    if not Path(filename).is_file():
        raise OSError("File not found: '%s'" % filename)
    if os.stat(filename).st_size > MAX_FILESIZE:
        raise OSError("File '%s' too big for read_ascii()" % filename)

    text = read_textfile(filename)
    lines = text.split('\n')

    ncol = None
    data, footers, headers = [], [], []

    lines.reverse()
    section = 'FOOTER'

    for line in lines:
        line = line.strip()
        if len(line) < 1:
            continue
        # look for section transitions (going from bottom to top)
        if section == 'FOOTER' and None not in getfloats(line):
            section = 'DATA'
        elif section == 'DATA' and None in getfloats(line):
            section = 'HEADER'

        # act on current section:
        if section == 'FOOTER':
            footers.append(line)
        elif section == 'HEADER':
            headers.append(line)
        elif section == 'DATA':
            rowdat = getfloats(line)
            if ncol is None:
                ncol = len(rowdat)
            elif ncol > len(rowdat):
                # pad a short row with NaNs
                rowdat.extend([np.nan]*(ncol-len(rowdat)))
            elif ncol < len(rowdat):
                # a longer row: pad all previous rows with NaNs
                for i in data:
                    i.extend([np.nan]*(len(rowdat)-ncol))
                ncol = len(rowdat)
            data.append(rowdat)

    # reverse header, footer, data, convert to arrays
    footers.reverse()
    headers.reverse()
    data.reverse()
    data = np.array(data).transpose()

    # try to parse attributes from header text
    header_attrs = {}
    for hline in headers:
        hline = hline.strip().replace('\t', ' ')
        if len(hline) < 1:
            continue
        if hline[0] in COMMENTCHARS:
            hline = hline[1:].strip()
        keywds = []
        if ':' in hline:    # keywords as in 'x: 22'
            words = hline.split(':', 1)
            keywds = words[0].split()
        elif '=' in hline:  # keywords as in 'x = 22'
            words = hline.split('=', 1)
            keywds = words[0].split()
        if len(keywds) == 1:
            key = colname(keywds[0])
            if key.startswith('_'):
                key = key[1:]
            if len(words) > 1:
                header_attrs[key] = words[1].strip()

    fpath = Path(filename).absolute()
    filename = fpath.as_posix()
    attrs = {'filename': filename}
    group = Group(name='ascii_file %s' % filename,
                  path=filename, filename=fpath.name,
                  header=headers, data=[], array_labels=[])

    if len(data) == 0:
        return group

    if sort and sort_column >= 0 and sort_column < ncol:
        data = data[:, np.argsort(data[sort_column])]

    group.data = data

    if len(footers) > 0:
        group.footer = footers

    group.attrs = Group(name='header attributes from %s' % filename)
    for key, val in header_attrs.items():
        setattr(group.attrs, key, val)

    if isinstance(labels, str):
        for bchar in ',#@%|:*':
            labels = labels.replace(bchar, '')
        labels = labels.split()

    if labels is None and not simple_labels:
        bldat = guess_beamline(headers)(headers)
        labels = bldat.get_array_labels()

        if getattr(bldat, 'energy_units', 'eV') != 'eV':
            group.energy_units = bldat.energy_units
        if getattr(bldat, 'energy_column', 1) != 1:
            group.energy_column = bldat.energy_column
        if getattr(bldat, 'mono_dspace', -1) > 0:
            group.mono_dspace = bldat.mono_dspace

    set_array_labels(group, labels=labels, simple_labels=simple_labels)
    return group
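
# Note on header attributes (see note 3 in the docstring): only single-word
# keys are parsed, so a header line like '# temperature = 300' yields
# dat.attrs.temperature == '300' (values are kept as strings).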


def set_array_labels(group, labels=None, simple_labels=False,
                     save_oldarrays=False):
    """set array names for a group from its 2D `data` array.

    Arguments
    ----------
      labels (list of strings or None): array of labels to use [None]
      simple_labels (bool): flag to use ('col1', 'col2', ...) [False]
      save_oldarrays (bool): flag to save old array names [False]

    Returns
    -------
      group with newly named attributes of 1D array data, and an
      updated `array_labels` giving the mapping of `data` columns
      to attribute names.

    Notes
    ------
      1. if `simple_labels=True` it will overwrite any values in `labels`

      2. Array labels must be valid python names. If not enough labels
         are specified, or if name clashes arise, the array names may be
         modified, often by appending an underscore and letter or by using
         ('col1', 'col2', ...) etc.

      3. When `save_oldarrays` is `False` (the default), arrays named in the
         current `group.array_labels` will be erased. Other arrays and
         attributes will not be changed.
    """
    write = sys.stdout.write
    if not hasattr(group, 'data'):
        write("cannot set array labels for group '%s': no `data`\n" % repr(group))
        return

    # clear old arrays, if desired
    oldlabels = getattr(group, 'array_labels', None)
    if oldlabels is not None and not save_oldarrays:
        for attr in oldlabels:
            if hasattr(group, attr):
                delattr(group, attr)

    ncols, nrow = group.data.shape

    ####
    # step 1: determine user-defined labels from input options,
    # generating the list `tlabels` of test labels
    #
    # generate simple column labels, used as a backup
    clabels = ['col%d' % (i+1) for i in range(ncols)]

    if isinstance(labels, str):
        labels = labels.split()

    tlabels = labels
    # if simple column names were requested, or no labels were given,
    # use the simple column names
    if simple_labels or tlabels is None:
        tlabels = clabels

    ####
    # step 2: check input and correct problems
    # 2.a: check for too few or too many labels
    if len(tlabels) < ncols:
        for i in range(len(tlabels), ncols):
            tlabels.append("col%i" % (i+1))
    elif len(tlabels) > ncols:
        tlabels = tlabels[:ncols]

    # 2.b: check for names that clash with group attributes
    # or that are repeated; append a letter.
    reserved_names = ('data', 'array_labels', 'filename',
                      'attrs', 'header', 'footer')
    extras = string.ascii_lowercase
    labels = []
    for i in range(ncols):
        lname = tlabels[i]
        if lname in reserved_names or lname in labels:
            lname = lname + '_a'
            j = 0
            while lname in labels:
                j += 1
                if j == len(extras):
                    break
                lname = "%s_%s" % (tlabels[i], extras[j])
            if lname in labels:
                lname = clabels[i]
        labels.append(lname)

    ####
    # step 3: assign attribute names, set 'array_labels'
    for i, name in enumerate(labels):
        setattr(group, name, group.data[i])
    group.array_labels = labels
    return group
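
# For example, renaming the columns of a previously read group (the label
# names here are illustrative):
#
#     dat = read_ascii('scan001.dat')
#     set_array_labels(dat, labels=['energy', 'i0', 'i1'])
#     # dat.energy, dat.i0, dat.i1 now map to dat.data[0], [1], [2]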


def write_ascii(filename, *args, commentchar='#', label=None, header=None):
    """
    write a list of items to an ASCII column file

    Arguments:
      args (list of arrays): list of arrays to write.
      commentchar (str) : character for comment ('#')
      label (str or None): array label line (autogenerated if None)
      header (list of strings): array of strings for header

    Returns:
      None

    Examples:
       >>> write_ascii('myfile', group.energy, group.norm, header=['comment1', 'comment2'])
    """
    ARRAY_MINLEN = 2
    write = sys.stdout.write
    com = commentchar
    if header is None:
        header = []

    arrays = []
    arraylen = None

    for arg in args:
        if isinstance(arg, np.ndarray) and len(arg) > ARRAY_MINLEN:
            if arraylen is None:
                arraylen = len(arg)
            else:
                arraylen = min(arraylen, len(arg))
            arrays.append(arg)
        else:
            header.append(repr(arg))

    if arraylen is None:
        raise ValueError("write_ascii() needs %i or more elements in arrays." % ARRAY_MINLEN)

    buff = []
    if len(header) == 0:
        buff = ['%s Output from Larch %s' % (com, time.ctime())]
    for s in header:
        buff.append('%s %s' % (com, s))
    buff.append('%s---------------------------------' % com)
    if label is None:
        label = (' '*13).join(['col%d' % (i+1) for i in range(len(arrays))])
    buff.append('%s %s' % (com, label))

    # trim all arrays to the shortest length so they stack cleanly
    arrays = np.array([arr[:arraylen] for arr in arrays])
    for i in range(arraylen):
        w = [" %s" % gformat(val[i], length=14) for val in arrays]
        buff.append(' '.join(w))
    buff.append('')

    with open(filename, 'w', encoding=sys.getdefaultencoding()) as fout:
        fout.write('\n'.join(buff))
    write("wrote to file '%s'\n" % filename)
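
# A round-trip sketch (the file name is hypothetical): write two arrays,
# then read them back with read_ascii:
#
#     en = np.linspace(8900, 9300, 401)
#     mu = np.exp(-en/5000.0)
#     write_ascii('xas_tmp.dat', en, mu, label='energy  mu',
#                 header=['sample: test'])
#     dat = read_ascii('xas_tmp.dat', labels=['energy', 'mu'])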


def write_group(filename, group, scalars=None, arrays=None,
                arrays_like=None, commentchar='#'):
    """(deprecated) write components of a group to an ASCII column file

    Warning:
       This is pretty minimal and may work poorly for large groups of complex data.
       Use `save_session` instead.
    """
    items = dir(group)
    npts = 0
    if arrays is None:
        arrays = []
    if scalars is None:
        scalars = []

    if arrays_like is not None and arrays_like in items:
        array = getattr(group, arrays_like)
        if isinstance(array, np.ndarray):
            npts = len(array)

    for name in items:
        val = getattr(group, name)
        if isinstance(val, np.ndarray):
            if npts != 0 and npts == len(val) and name not in arrays:
                arrays.append(name)

    header = []
    for s in scalars:
        if s in items:
            val = getattr(group, s)
            header.append("%s = %s" % (s, val))

    label = ' '.join(arrays)

    args = []
    for name in arrays:
        if name in items:
            args.append(getattr(group, name))

    write_ascii(filename, *args, commentchar=commentchar,
                label=label, header=header)


def read_fdmnes(filename, **kwargs):
    """read [FDMNES](http://fdmnes.neel.cnrs.fr/) ascii files"""
    group = read_ascii(filename, **kwargs)
    group.header_dict = dict(filetype='FDMNES', energy_units='eV')
    for headline in group.header:
        if "E_edge" in headline:
            if headline.startswith("#"):
                headline = headline[1:]
            vals = [float(v) for v in headline.split(" = ")[0].split(" ") if v]
            vals_names = headline.split(" = ")[1].split(", ")
            group.header_dict.update(dict(zip(vals_names, vals)))
    group.name = f'FDMNES file {filename}'
    group.energy += group.header_dict["E_edge"]
    # fix array labels like '_arrlabel' -> 'arrlabel'
    for ilab, lab in enumerate(group.array_labels):
        if lab.startswith("_"):
            fixlab = lab[1:]
            group.array_labels[ilab] = fixlab
            delattr(group, lab)
            setattr(group, fixlab, group.data[ilab])
    return group


def guess_filereader(path, return_text=False):
    """guess the function name to use to read a data file, based on its first line

    Arguments
    ---------
      path (str) : file path to be read
      return_text (bool) : whether to also return the text of the file [False]

    Returns
    -------
      name of the function (as a string) to use to read the file;
      if return_text, the text of the file is also returned
    """
    text = read_textfile(path)
    lines = text.split('\n')
    line1 = lines[0].lower()
    reader = 'read_ascii'
    if 'epics scan' in line1:
        reader = 'read_gsescan'
    if 'xdi' in line1:
        reader = 'read_xdi'
    if 'epics stepscan file' in line1:
        reader = 'read_gsexdi'
    if ("#s" in line1) or ("#f" in line1):
        reader = 'read_specfile'
    if 'fdmnes' in line1:
        reader = 'read_fdmnes'
    if return_text:
        return reader, text
    return reader
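
# A minimal dispatch sketch (the file name is hypothetical; of the reader
# names returned, only read_ascii and read_fdmnes are defined in this module):
#
#     reader, text = guess_filereader('scan001.dat', return_text=True)
#     func = {'read_fdmnes': read_fdmnes}.get(reader, read_ascii)
#     dat = func('scan001.dat')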