qmcpack/nexus/lib/hdfreader.py

390 lines
11 KiB
Python

##################################################################
## (c) Copyright 2015- by Jaron T. Krogel ##
##################################################################
#====================================================================#
# hdfreader.py #
# Support for reading HDF5 files into local structured format #
# containing numpy arrays. #
# #
# Content summary: #
# HDFreader #
# Main class to read HDF files and convert to object format. #
# #
# HDFgroup #
# Class representing an HDF group. #
# Contains other HDFgroup's or named data as numpy arrays #
# #
#====================================================================#
from numpy import array,ndarray,minimum,abs,ix_,resize
import sys
import keyword
from inspect import getmembers
from superstring import valid_variable_name
from generic import obj
from developer import DevBase,unavailable
try:
import h5py
except:
h5py = unavailable('h5py')
#end try
from debug import *
class HDFglobals(DevBase):
view = False
#end class HDFglobals
class HDFgroup(DevBase):
def _escape_name(self,name):
if name in self._escape_names:
name=name+'_'
#end if
return name
#end def escape_name
def _set_parent(self,parent):
self._parent=parent
return
#end def set_parent
def _add_dataset(self,name,dataset):
self._datasets[name]=dataset
return
#end def add_dataset
def _add_group(self,name,group):
group._name=name
self._groups[name]=group
return
#end def add_group
def _contains_group(self,name):
return name in self._groups.keys()
#end def _contains_group
def _contains_dataset(self,name):
return name in self._datasets.keys()
#end def _contains_dataset
def _to_string(self):
s=''
if len(self._datasets)>0:
s+=' datasets:\n'
for k,v in self._datasets.items():
s+= ' '+k+'\n'
#end for
#end if
if len(self._groups)>0:
s+= ' groups:\n'
for k,v in self._groups.items():
s+= ' '+k+'\n'
#end for
#end if
return s
#end def list
# def __str__(self):
# return self._to_string()
# #end def __str__
#
# def __repr__(self):
# return self._to_string()
# #end def __repr__
def __init__(self):
self._name=''
self._parent=None
self._groups={};
self._datasets={};
self._group_counts={}
self._escape_names=None
self._escape_names=set(dict(getmembers(self)).keys()) | set(keyword.kwlist)
return
#end def __init__
def _remove_hidden(self,deep=True):
if '_parent' in self:
del self._parent
#end if
if deep:
for name,value in self.items():
if isinstance(value,HDFgroup):
value._remove_hidden()
#end if
#end for
#end if
for name in list(self.keys()):
if name[0]=='_':
del self[name]
#end if
#end for
#end def _remove_hidden
# read in all data views (h5py datasets) into arrays
# useful for converting a single group read in view form to full arrays
def read_arrays(self):
self._remove_hidden()
for k,v in self.items():
if isinstance(v,HDFgroup):
v.read_arrays()
else:
self[k] = array(v)
#end if
#end for
#end def read_arrays
def get_keys(self):
if '_groups' in self:
keys = list(self._groups.keys())
else:
keys = list(self.keys())
#end if
return keys
#end def get_keys
#project interface methods
def zero(self,*names):
for name in names:
if name in self and isinstance(self[name],ndarray):
self[name][:] = 0
#end if
#end for
for name in self.get_keys():
value = self[name]
if isinstance(value,HDFgroup):
value.zero(*names)
#end if
#end for
#self.sum(*names)
#end def zero
def minsize(self,other,*names):
name_set = set(names)
snames = set(self.keys()) & name_set
onames = set(other.keys()) & name_set
if snames==onames:
for name in snames:
svalue = self[name]
ovalue = other[name]
if not isinstance(svalue,ndarray) or not isinstance(ovalue,ndarray):
self.error(name+' is not an array')
#end if
shape = minimum(svalue.shape,ovalue.shape)
self[name] = resize(svalue,shape)
#end for
#end if
for name in self.get_keys():
value = self[name]
if isinstance(value,HDFgroup):
if name in other and isinstance(other[name],HDFgroup):
value.minsize(other[name])
else:
self.error(name+' not found in minsize partner')
#end if
#end if
#end for
#self.sum(*names)
#end def minsize
def accumulate(self,other,*names):
name_set = set(names)
snames = set(self.keys()) & name_set
onames = set(other.keys()) & name_set
if snames==onames:
for name in snames:
svalue = self[name]
ovalue = other[name]
if not isinstance(svalue,ndarray) or not isinstance(ovalue,ndarray):
self.error(name+' is not an array')
#end if
shape = minimum(svalue.shape,ovalue.shape)
if abs(shape-array(svalue.shape)).sum() > 0:
self.error(name+' in partner is too large')
#end if
ranges = []
for s in shape:
ranges.append(range(s))
#end for
#add the part of the other data that fits into own data
svalue += ovalue[ix_(*ranges)]
#end for
#end if
for name in self.get_keys():
value = self[name]
if isinstance(value,HDFgroup):
if name in other and isinstance(other[name],HDFgroup):
value.accumulate(other[name])
else:
self.error(name+' not found in accumulate partner')
#end if
#end if
#end for
#self.sum(*names)
#end def accumulate
def normalize(self,normalization,*names):
for name in names:
if name in self and isinstance(self[name],ndarray):
self[name] /= normalization
#end if
#end for
for name in self.get_keys():
value = self[name]
if isinstance(value,HDFgroup):
value.normalize(normalization,*names)
#end if
#end for
#self.sum(*names)
#end def normalize
def sum(self,*names):
for name in names:
if name in self and isinstance(self[name],ndarray) and name=='value':
s = self[name].mean(0).sum()
#end if
#end for
#end def sum
#end class HDFgroup
class HDFreader(DevBase):
def __init__(self,fpath,verbose=False,view=False):
HDFglobals.view = view
if verbose:
print(' Initializing HDFreader')
#end if
self.fpath=fpath
if verbose:
print(' loading h5 file')
#end if
try:
self.hdf = h5py.File(fpath,'r')
except IOError:
self._success = False
self.hdf = obj(obj=obj())
else:
self._success = True
#end if
if verbose:
print(' converting h5 file to dynamic object')
#end if
#convert the hdf 'dict' into a dynamic object
self.nlevels=1
self.ilevel=0
# Set the current hdf group
self.obj = HDFgroup()
self.cur=[self.obj]
self.hcur=[self.hdf]
if self._success:
cur = self.cur[self.ilevel]
hcur = self.hcur[self.ilevel]
for kr,v in hcur.items():
k=cur._escape_name(kr)
if valid_variable_name(k):
if isinstance(v, h5py.Dataset):
self.add_dataset(cur,k,v)
elif isinstance(v, h5py.Group):
self.add_group(hcur,cur,k,v)
else:
self.error('encountered invalid type: '+str(type(v)))
else:
self.warn('attribute '+k+' is not a valid variable name and has been ignored')
#end if
#end for
#end if
if verbose:
print(' end HDFreader Initialization')
#end if
return
#end def __init__
def increment_level(self):
self.ilevel+=1
self.nlevels = max(self.ilevel+1,self.nlevels)
if self.ilevel+1==self.nlevels:
self.cur.append(None)
self.hcur.append(None)
#end if
self.pad = self.ilevel*' '
return
#end def increment_level
def decrement_level(self):
self.ilevel-=1
self.pad = self.ilevel*' '
return
#end def decrement_level
def add_dataset(self,cur,k,v):
if not HDFglobals.view:
cur[k]=array(v)
else:
cur[k] = v
#end if
cur._add_dataset(k,cur[k])
return
#end def add_dataset
def add_group(self,hcur,cur,k,v):
cur[k] = HDFgroup()
cur._add_group(k,cur[k])
cur._groups[k]._parent = cur
self.increment_level()
self.cur[self.ilevel] = cur._groups[k]
self.hcur[self.ilevel] = hcur[k]
cur = self.cur[self.ilevel]
hcur = self.hcur[self.ilevel]
for kr,v in hcur.items():
k=cur._escape_name(kr)
if valid_variable_name(k):
if isinstance(v, h5py.Dataset):
self.add_dataset(cur,k,v)
elif isinstance(v, h5py.Group):
self.add_group(hcur,cur,k,v)
#end if
else:
self.warn('attribute '+k+' is not a valid variable name and has been ignored')
#end if
#end for
return
#end def add_group
#end class HDFreader
def read_hdf(fpath,verbose=False,view=False):
return HDFreader(fpath=fpath,verbose=verbose,view=view).obj
#end def read_hdf