HDF5完整文件结构与操作指南-程序员充电站

HDF5完整文件结构与操作指南

1. 完整文件结构概览

1.1 理想的HDF5文件结构

comprehensive_example.h5                         # 根文件
│
├── 📁 metadata/                                 # 元数据组
│   ├── attrs: {title, author, version}         # 组属性
│   ├── 📄 description (string)                  # 描述文本
│   ├── 📄 creation_date (datetime)              # 创建日期
│   └── 📄 parameters (structured)               # 参数结构体
│
├── 📁 raw_data/                                 # 原始数据组
│   ├── 📄 sensor_1 (1D float32)                 # 1维数据
│   ├── 📄 sensor_2 (1D float32)
│   ├── 📄 images (3D uint8)                     # 3维图像数据
│   │   ├── attrs: {resolution, units}          # 数据集属性
│   │   └── dims: [time, height, width]         # 维度标签
│   └── 📄 measurements (2D float64)             # 2维测量数据
│
├── 📁 processed_data/                           # 处理后数据组
│   ├── 📄 filtered (2D compressed)              # 压缩数据集
│   ├── 📄 normalized (2D chunked)               # 分块数据集
│   └── 📄 statistics (compound type)            # 复合数据类型
│
├── 📁 models/                                   # 模型组
│   ├── 📁 neural_network/                       # 神经网络子组
│   │   ├── 📄 layer_1_weights (2D)
│   │   ├── 📄 layer_1_biases (1D)
│   │   ├── 📄 layer_2_weights (2D)
│   │   └── 📄 layer_2_biases (1D)
│   └── 📁 config/
│       └── 📄 hyperparameters (JSON)
│
├── 📁 time_series/                              # 时间序列数据
│   ├── 📄 data (resizable 2D)                   # 可扩展数据集
│   ├── 📄 timestamps (1D datetime)
│   └── 📄 labels (1D categorical)
│
├── 📁 references/                               # 引用和链接
│   ├── 🔗 link_to_raw -> /raw_data              # 软链接
│   ├── 🔗 external_link -> file.h5:/data        # 外部链接
│   └── 📄 object_references (refs)              # 对象引用
│
├── 📁 special_types/                            # 特殊数据类型
│   ├── 📄 string_array (variable length)        # 变长字符串
│   ├── 📄 boolean_mask (bool)                   # 布尔类型
│   ├── 📄 enum_data (enum)                      # 枚举类型
│   ├── 📄 complex_numbers (complex)             # 复数
│   └── 📄 nested_compound (nested)              # 嵌套结构体
│
└── 📁 large_data/                               # 大数据集
    ├── 📄 chunked_compressed (gzip)             # 分块+压缩
    ├── 📄 lzf_compressed (lzf)                  # LZF压缩
    └── 📄 virtual_dataset (virtual)             # 虚拟数据集

2. 基础数据集类型

2.1 数值类型数据集

2.1.1 整数类型

import h5py
import numpy as np

# (dataset name, sample values, NumPy dtype) for each fixed-width integer type,
# listed in the order the datasets are created.
_SIGNED_SAMPLES = [
    ('int8_data', [1, 2, 3], np.int8),
    ('int16_data', [100, 200], np.int16),
    ('int32_data', [1000, 2000], np.int32),
    ('int64_data', [10000, 20000], np.int64),
]
_UNSIGNED_SAMPLES = [
    ('uint8_data', [255, 128], np.uint8),
    ('uint16_data', [65535], np.uint16),
    ('uint32_data', [4294967295], np.uint32),
    ('uint64_data', [2**63], np.uint64),
]

# Write one dataset per integer dtype: signed first, then unsigned.
with h5py.File('example.h5', 'w') as f:
    for ds_name, values, ds_dtype in _SIGNED_SAMPLES + _UNSIGNED_SAMPLES:
        f.create_dataset(ds_name, data=np.array(values, dtype=ds_dtype))

# Read one dataset back and show the dtype and values that were read.
with h5py.File('example.h5', 'r') as f:
    int8_data = f['int8_data'][:]
    print(f"数据类型:{int8_data.dtype}")
    print(f"数据:{int8_data}")

2.1.2 浮点类型

# Store the same kind of data at three floating-point precisions.
with h5py.File('example.h5', 'w') as f:
    # Single precision
    single = np.array([3.14, 2.71], dtype=np.float32)
    f.create_dataset('float32_data', data=single)

    # Double precision
    double = np.array([3.141592653589793], dtype=np.float64)
    f.create_dataset('float64_data', data=double)

    # Half precision (saves space)
    half = np.array([1.5, 2.5], dtype=np.float16)
    f.create_dataset('float16_data', data=half)

# Read each dataset back and report the bit width of its elements.
with h5py.File('example.h5', 'r') as f:
    for name in ('float32_data', 'float64_data', 'float16_data'):
        data = f[name][:]
        print(f"{name}: dtype={data.dtype}, precision={data.itemsize*8}bits")

2.1.3 复数类型

# Store complex arrays at single (complex64) and double (complex128) precision.
with h5py.File('example.h5', 'w') as f:
    complex_data = np.array([1 + 2j, 3 + 4j, 5 + 6j], dtype=np.complex64)
    f.create_dataset('complex64', data=complex_data)

    complex_data_high = np.array([1 + 2j, 3 + 4j], dtype=np.complex128)
    f.create_dataset('complex128', data=complex_data_high)

# Read the complex64 dataset and decompose it with NumPy.
with h5py.File('example.h5', 'r') as f:
    c_data = f['complex64'][:]
    real_part, imag_part = c_data.real, c_data.imag
    print(f"实部:{real_part}")
    print(f"虚部:{imag_part}")
    print(f"模:{np.abs(c_data)}")
    print(f"相位:{np.angle(c_data)}")

2.1.4 布尔类型

# Store a small boolean vector and a random 100x100 boolean mask.
with h5py.File('example.h5', 'w') as f:
    bool_data = np.array([True, False, True, True], dtype=bool)
    f.create_dataset('boolean_mask', data=bool_data)

    # Boolean matrix, intended for use as a mask.
    bool_matrix = np.random.rand(100, 100) > 0.5
    f.create_dataset('random_mask', data=bool_matrix)

# Read the mask back and apply it to freshly generated data.
with h5py.File('example.h5', 'r') as f:
    mask = f['random_mask'][:]
    data = np.random.randn(100, 100)
    # Boolean indexing keeps only the elements where the mask is True.
    filtered_data = data[mask]
    print(f"掩码选中了{mask.sum()}个元素")

2.2 字符串类型数据集

2.2.1 固定长度字符串

# Fixed-length string datasets: ASCII bytes and UTF-8 encoded text.
with h5py.File('example.h5', 'w') as f:
    # Fixed-length ASCII strings stored as a NumPy 'S10' byte-string array.
    fixed_strings = np.array([b'hello', b'world', b'test'], dtype='S10')
    f.create_dataset('fixed_ascii', data=fixed_strings)

    # Fixed-length Unicode strings.
    # h5py cannot store NumPy 'U' (UTF-32) arrays and raises TypeError for them,
    # so the text is encoded to UTF-8 and stored as fixed-length UTF-8 bytes.
    # 10 characters need at most 40 bytes (UTF-8 uses up to 4 bytes per char).
    fixed_unicode = np.array(['你好', '世界', '测试'], dtype='U10')
    utf8_fixed = h5py.string_dtype(encoding='utf-8', length=40)
    encoded_unicode = np.char.encode(fixed_unicode, 'utf-8').astype(utf8_fixed)
    f.create_dataset('fixed_unicode', data=encoded_unicode, dtype=utf8_fixed)

# Read the strings back.
with h5py.File('example.h5', 'r') as f:
    ascii_data = f['fixed_ascii'][:]
    # Decode the stored UTF-8 bytes back into a NumPy Unicode array.
    unicode_data = np.char.decode(f['fixed_unicode'][:], 'utf-8')
    print(f"ASCII:{ascii_data}")
    print(f"Unicode:{unicode_data}")

2.2.2 变长字符串

# Variable-length string datasets in ASCII and UTF-8 encodings.
with h5py.File('example.h5', 'w') as f:
    # Variable-length ASCII strings
    dt_ascii = h5py.string_dtype(encoding='ascii')
    var_strings = ['short', 'a very long string', 'medium']
    f.create_dataset('variable_ascii', data=var_strings, dtype=dt_ascii)

    # Variable-length UTF-8 strings
    dt_utf8 = h5py.string_dtype(encoding='utf-8')
    var_unicode = ['短', '这是一个很长的中文字符串', '中等长度']
    f.create_dataset('variable_utf8', data=var_unicode, dtype=dt_utf8)

# Read the variable-length strings back.
# NOTE(review): recent h5py versions presumably return bytes objects here
# rather than str -- confirm against the installed h5py version.
with h5py.File('example.h5', 'r') as f:
    utf8_dset = f['variable_utf8']
    var_ascii = f['variable_ascii'][:]
    var_utf8 = utf8_dset[:]
    print(f"变长ASCII:{var_ascii}")
    print(f"变长UTF-8:{var_utf8}")
    # Single-element access
    print(f"第一个元素:{utf8_dset[0]}")

2.3 多维数组

2.3.1 一维数组（向量）

# One-dimensional dataset: a 1000-sample sine wave with descriptive attributes.
with h5py.File('example.h5', 'w') as f:
    sample_points = np.linspace(0, 10 * np.pi, 1000)
    time_series = np.sin(sample_points)
    dset = f.create_dataset('time_series', data=time_series)
    dset.attrs['description'] = 'Sine wave'
    dset.attrs['sampling_rate'] = 100  # Hz

# Inspect the dataset and read slices from it.
with h5py.File('example.h5', 'r') as f:
    ts = f['time_series']
    print(f"形状:{ts.shape}")
    print(f"长度:{len(ts)}")
    print(f"采样率:{ts.attrs['sampling_rate']}Hz")

    # Slicing the dataset object reads only the requested range.
    first_100 = ts[:100]
    last_100 = ts[-100:]

2.3.2 二维数组（矩阵）

# Two-dimensional datasets: a grayscale image and a numeric table.
with h5py.File('example.h5', 'w') as f:
    # Image data (grayscale), 512x512 uint8 with values in [0, 256).
    image = np.random.randint(0, 256, (512, 512), dtype=np.uint8)
    dset = f.create_dataset('grayscale_image', data=image)
    for attr_name, attr_value in (('height', 512), ('width', 512), ('channels', 1)):
        dset.attrs[attr_name] = attr_value

    # Table data: 1000 rows, 50 columns.
    table_data = np.random.randn(1000, 50)
    f.create_dataset('table_data', data=table_data)

# Read whole datasets and rectangular sub-regions.
with h5py.File('example.h5', 'r') as f:
    image_dset = f['grayscale_image']
    table_dset = f['table_data']

    img = image_dset[:]                  # the whole image
    roi = image_dset[100:200, 100:200]   # a region of interest

    col_5 = table_dset[:, 5]             # column with index 5
    row_10 = table_dset[10, :]           # row with index 10
    subset = table_dset[0:100, 0:10]     # rectangular subset

2.3.3 三维数组（体数据）

# Volume-style datasets: an RGB frame sequence and a 3-D scan volume.
with h5py.File('example.h5', 'w') as f:
    # RGB image sequence laid out as (frames, height, width, channels).
    video = np.random.randint(0, 256, (100, 480, 640, 3), dtype=np.uint8)
    dset = f.create_dataset('video_rgb', data=video)
    video_attrs = {
        'num_frames': 100,
        'height': 480,
        'width': 640,
        'channels': 3,
        'fps': 30,
    }
    for attr_name, attr_value in video_attrs.items():
        dset.attrs[attr_name] = attr_value

    # 3-D medical image (CT scan), axes [x, y, z].
    ct_scan = np.random.randn(256, 256, 128)
    f.create_dataset('ct_scan', data=ct_scan)

# Read frames, strided time slices and spatial crops.
with h5py.File('example.h5', 'r') as f:
    video_dset = f['video_rgb']
    frame_10 = video_dset[10, :, :, :]                   # one specific frame
    time_slice = video_dset[0:50:5, :, :, :]             # every 5th frame of the first 50
    spatial_slice = video_dset[:, 100:200, 200:300, :]   # spatial crop of every frame

2.3.4 四维及更高维数组

# Four- and five-dimensional datasets.
with h5py.File('example.h5', 'w') as f:
    # 4D: [batch, height, width, channels]
    batch_images = np.random.randn(32, 224, 224, 3)
    f.create_dataset('image_batch', data=batch_images)

    # 5D: [time, batch, height, width, channels]
    video_batch = np.random.randn(10, 16, 64, 64, 3)
    f.create_dataset('video_batch', data=video_batch)

# Read single entries and strided sub-blocks.
with h5py.File('example.h5', 'r') as f:
    # First entry along the leading axis.
    batch_0 = f['image_batch'][0, :, :, :]

    # Compound slice: ranges on the first two axes, stride 2 on the spatial
    # axes (down-sampling), all channels.
    subset = f['video_batch'][0:5, 0:8, ::2, ::2, :]

3. 组结构操作

3.1 创建和组织组

# Create a nested group hierarchy plus a set of parallel experiment groups.
with h5py.File('example.h5', 'w') as f:
    # Method 1: create a group directly.
    group1 = f.create_group('level1')
    # Method 2: create a nested group by path.
    group2 = f.create_group('level1/level2')
    # Method 3: require_group returns the group if it exists, else creates it.
    group3 = f.require_group('level1/level2/level3')

    # Put one dataset into each level.
    for target, ds_name, length in (
        (group1, 'data1', 10),
        (group2, 'data2', 20),
        (group3, 'data3', 30),
    ):
        target.create_dataset(ds_name, data=np.arange(length))

    # Create several sibling groups, each with results and metadata.
    for i in range(5):
        group = f.create_group(f'experiment_{i}')
        group.create_dataset('results', data=np.random.randn(100))
        group.attrs['experiment_id'] = i
        group.attrs['timestamp'] = f'2024-01-{i+1:02d}'

3.2 遍历组结构

def print_structure(name, obj):
    """Print one line describing an HDF5 object; meant as a visititems callback.

    Args:
        name: Path of the object relative to the group being visited.
        obj: The visited object. Groups and datasets are printed; anything
            else is ignored.
    """
    # Indentation grows with the number of '/' separators in the path.
    indent = ' ' * name.count('/')
    if isinstance(obj, h5py.Group):
        print(f"{indent}📁{name}/")
        return
    if isinstance(obj, h5py.Dataset):
        print(f"{indent}📄{name}{obj.shape}{obj.dtype}")


with h5py.File('example.h5', 'r') as f:
    print("方法1: visititems")
    f.visititems(print_structure)

    print("\n方法2: 手动遍历")

    def recursive_print(group, level=0):
        """Walk *group* depth-first, printing every child indented by depth."""
        indent = ' ' * level
        for key in group.keys():
            item = group[key]
            if not isinstance(item, h5py.Group):
                print(f"{indent}📄{key}{item.shape}")
                continue
            print(f"{indent}📁{key}/")
            recursive_print(item, level + 1)

    recursive_print(f)

    print("\n方法3: 只遍历特定组")
    for key in f['level1'].keys():
        print(f"Found:{key}")

3.3 移动、复制和删除组

# Copy, rename and delete a group, then copy a group into another file.
with h5py.File('example.h5', 'a') as f:
    # Copy the group under a new name.
    f.copy('level1', 'level1_copy')

    # Move (rename) the copy.
    f.move('level1_copy', 'level1_backup')

    # Delete the renamed group.
    del f['level1_backup']

    # Copy into a second file, giving it a different name there.
    with h5py.File('destination.h5', 'w') as f_dest:
        f.copy('level1', f_dest, name='imported_data')

3.4 按条件查找数据集

def find_datasets(group, condition):
    """Collect the names of all datasets under *group* that satisfy *condition*.

    Args:
        group: Object exposing ``visititems`` (an open file or a group).
        condition: Callable ``(name, obj)`` returning a truthy value for the
            datasets to keep.

    Returns:
        List of matching dataset names, in visit order.
    """
    matches = []

    def _collect(name, obj):
        # Only datasets are tested; groups are skipped.
        if isinstance(obj, h5py.Dataset) and condition(name, obj):
            matches.append(name)

    group.visititems(_collect)
    return matches


with h5py.File('example.h5', 'r') as f:
    # All floating-point datasets.
    float_datasets = find_datasets(f, lambda name, obj: obj.dtype.kind == 'f')
    # Datasets with more than 1000 elements.
    large_datasets = find_datasets(f, lambda name, obj: obj.size > 1000)
    # Datasets carrying a particular attribute.
    with_attr = find_datasets(f, lambda name, obj: 'experiment_id' in obj.attrs)

    print(f"浮点数据集:{float_datasets}")
    print(f"大数据集:{large_datasets}")
    print(f"有experiment_id属性:{with_attr}")

4. 属性系统

4.1 文件级属性

# Attach attributes of several value types directly to the file (root group).
with h5py.File('example.h5', 'w') as f:
    root_attrs = f.attrs

    # String attributes
    root_attrs['title'] = 'My Research Data'
    root_attrs['author'] = 'Josh Witt'
    root_attrs['institution'] = 'University'

    # Numeric attributes
    root_attrs['version'] = 1.0
    root_attrs['year'] = 2024

    # Array attributes
    root_attrs['dimensions'] = [1024, 768]
    root_attrs['channels'] = [0, 1, 2]

    # Date/time, stored as an ISO-8601 string.
    from datetime import datetime
    root_attrs['created'] = datetime.now().isoformat()

    # Boolean attributes
    root_attrs['is_validated'] = True
    root_attrs['is_published'] = False

# Read the file-level attributes back.
with h5py.File('example.h5', 'r') as f:
    print("文件属性:")
    for key, value in f.attrs.items():
        print(f"{key}:{value}")

4.2 组级属性

# Attach experiment metadata to groups, then filter groups by an attribute.
with h5py.File('example.h5', 'w') as f:
    for exp_id in range(3):
        group = f.create_group(f'experiment_{exp_id}')
        meta = group.attrs

        # Experiment parameters
        meta['temperature'] = 20 + exp_id * 5   # degrees Celsius
        meta['pressure'] = 1.0 + exp_id * 0.1   # atm
        meta['duration'] = 3600                 # seconds

        # Experiment status
        meta['status'] = 'completed'
        meta['quality_score'] = 0.95

        # Reference information
        meta['reference_paper'] = 'Smith et al., 2024'
        meta['doi'] = f'10.1234/journal.{exp_id}'

# Read the attributes and select the high-temperature experiments.
with h5py.File('example.h5', 'r') as f:
    # Objects without a 'temperature' attribute default to 0 and are excluded.
    high_temp_exps = [
        key for key in f.keys()
        if f[key].attrs.get('temperature', 0) > 25
    ]
    print(f"高温实验:{high_temp_exps}")

4.3 数据集级属性

# Create a dataset and describe it with detailed metadata attributes.
with h5py.File('example.h5', 'w') as f:
    data = np.random.randn(1000, 100)
    dset = f.create_dataset('measurements', data=data)
    meta = dset.attrs

    # Physical units and value statistics
    meta['units'] = 'meters per second'
    meta['range'] = [data.min(), data.max()]
    meta['mean'] = data.mean()
    meta['std'] = data.std()

    # Acquisition information
    meta['sampling_rate'] = 1000  # Hz
    meta['num_channels'] = 100
    meta['calibration_factor'] = 1.05

    # Quality control
    meta['outliers_removed'] = 5
    meta['missing_values'] = 0
    meta['validated'] = True

    # Processing history
    meta['preprocessing'] = 'bandpass filter 0.1-100 Hz'
    meta['detrended'] = True

# Use the stored attributes while processing the data.
with h5py.File('example.h5', 'r') as f:
    dset = f['measurements']

    # Scale the data by the stored calibration factor.
    calibration = dset.attrs['calibration_factor']
    data = dset[:] * calibration

    # Show the stored statistics.
    print(f"单位:{dset.attrs['units']}")
    print(f"均值:{dset.attrs['mean']:.2f}")
    print(f"标准差:{dset.attrs['std']:.2f}")

4.4 修改和删除属性

# Modify, add, delete and bulk-update attributes on an existing dataset.
# Relies on `datetime` having been imported by an earlier snippet.
with h5py.File('example.h5', 'a') as f:
    dset = f['measurements']
    meta = dset.attrs

    # Assigning to a key sets (or overwrites) that attribute.
    meta['version'] = 2.0

    # Add a new attribute.
    meta['last_modified'] = datetime.now().isoformat()

    # Delete an attribute only if it is present.
    if 'temporary' in meta:
        del meta['temporary']

    # Update several attributes in one call.
    new_attrs = {
        'processed': True,
        'algorithm': 'FFT',
        'window': 'Hamming',
    }
    meta.update(new_attrs)

5. 高级数据类型

5.1 复合数据类型（结构体）

# Define a compound (struct-like) data type.
dt = np.dtype([
    ('name', 'S50'),         # fixed-length byte string
    ('age', 'i4'),           # 32-bit integer
    ('height', 'f4'),        # 32-bit float
    ('weight', 'f4'),
    ('is_active', '?'),      # boolean
    ('scores', 'f4', (3,)),  # fixed-length array of three floats
])

with h5py.File('example.h5', 'w') as f:
    # Build a structured array and store it as one dataset.
    records = [
        (b'Alice', 25, 165.5, 55.2, True, [90, 85, 88]),
        (b'Bob', 30, 175.0, 70.5, True, [78, 82, 80]),
        (b'Charlie', 28, 180.2, 75.0, False, [95, 92, 89]),
    ]
    data = np.array(records, dtype=dt)
    dset = f.create_dataset('people', data=data)
    dset.attrs['description'] = 'Personnel records'

# Read the records and access them by field, by row, and by boolean filter.
with h5py.File('example.h5', 'r') as f:
    data = f['people'][:]

    # Field access
    names = data['name']
    ages = data['age']
    scores = data['scores']

    # Record access
    first_person = data[0]
    print(f"第一个人:{first_person['name']}, 年龄:{first_person['age']}")

    # Filtering with boolean masks built from fields
    active_people = data[data['is_active']]
    adults = data[data['age'] >= 18]

5.2 嵌套复合类型

# Define a nested structure: a person record that embeds an address record.
address_dtype = np.dtype([
    ('street', 'S100'),
    ('city', 'S50'),
    ('zipcode', 'i4'),
])
person_dtype = np.dtype([
    ('id', 'i4'),
    ('name', 'S50'),
    ('address', address_dtype),  # nested structure
    ('salary', 'f8'),
])

with h5py.File('example.h5', 'w') as f:
    employee_rows = [
        (1, b'Alice', (b'123 Main St', b'Boston', 12345), 75000.0),
        (2, b'Bob', (b'456 Oak Ave', b'NYC', 10001), 85000.0),
    ]
    data = np.array(employee_rows, dtype=person_dtype)
    f.create_dataset('employees', data=data)

# Read the nested data back.
with h5py.File('example.h5', 'r') as f:
    data = f['employees'][:]

    # Nested fields are reached by chaining field names.
    addresses = data['address']
    cities = addresses['city']
    zipcodes = addresses['zipcode']
    print(f"员工城市:{cities}")
    print(f"邮编:{zipcodes}")

5.3 枚举类型

# Create an enumerated type mapping status names to integer codes.
_STATUS_CODES = {
    'PENDING': 0,
    'RUNNING': 1,
    'COMPLETED': 2,
    'FAILED': 3,
}
status_enum = h5py.enum_dtype(_STATUS_CODES, basetype='i')

with h5py.File('example.h5', 'w') as f:
    # Store integer codes using the enum type.
    statuses = np.array([0, 1, 2, 1, 2, 3], dtype=status_enum)
    dset = f.create_dataset('task_status', data=statuses)

# Read the codes back and count how often each occurs.
with h5py.File('example.h5', 'r') as f:
    statuses = f['task_status'][:]
    unique, counts = np.unique(statuses, return_counts=True)
    for val, count in zip(unique, counts):
        print(f"状态{val}:{count}个")

5.4 变长数据类型

# Ragged data: every element of the dataset is an array of its own length.
with h5py.File('example.h5', 'w') as f:
    # Variable-length int32 arrays
    vlen_int = h5py.vlen_dtype(np.dtype('int32'))
    int_rows = [[1, 2, 3], [4, 5], [6, 7, 8, 9, 10]]
    data = np.array(int_rows, dtype=object)
    f.create_dataset('variable_length_arrays', data=data, dtype=vlen_int)

    # Variable-length float64 arrays
    vlen_float = h5py.vlen_dtype(np.dtype('float64'))
    float_rows = [[1.1, 2.2], [3.3, 4.4, 5.5, 6.6], [7.7]]
    ragged_data = np.array(float_rows, dtype=object)
    f.create_dataset('ragged_arrays', data=ragged_data, dtype=vlen_float)

# Read the variable-length data back row by row.
with h5py.File('example.h5', 'r') as f:
    vlen_data = f['variable_length_arrays'][:]
    for i, arr in enumerate(vlen_data):
        print(f"行{i}: 长度={len(arr)}, 数据={arr}")

6. 引用和链接

6.1 软链接（Soft Links）

# Soft links: path-based aliases to an object in the same file.
with h5py.File('example.h5', 'w') as f:
    # Create the original data.
    data = np.arange(100)
    f.create_dataset('data/original', data=data)

    # Soft link at the root.
    f['link_to_original'] = h5py.SoftLink('/data/original')

    # Soft link inside another group.
    f.create_group('analysis')
    f['analysis/data_link'] = h5py.SoftLink('/data/original')

# Access the data through the link and through its real path.
with h5py.File('example.h5', 'r') as f:
    data_via_link = f['link_to_original'][:]
    data_original = f['data/original'][:]

    # Check that both paths yield the same values.
    print(f"数据相同:{np.array_equal(data_via_link,data_original)}")

6.2 硬链接（Hard Links）

# Hard links: several names that refer to the same dataset object.
with h5py.File('example.h5', 'w') as f:
    data = np.random.randn(100)
    dset = f.create_dataset('original_data', data=data)

    # Assigning an existing object to new names creates hard links.
    for alias in ('copy1', 'copy2'):
        f[alias] = dset

    # Remove the original name; the snippet then reads through another name.
    del f['original_data']
    data_via_copy = f['copy1'][:]

6.3 外部链接（External Links）

# Create the source file that will be linked to.
with h5py.File('source.h5', 'w') as f:
    f.create_dataset('external_data', data=np.arange(1000))

# Create a file whose entries are external links into the source file.
with h5py.File('main.h5', 'w') as f:
    external_targets = {
        'linked_data': '/external_data',  # a dataset in the other file
        'linked_group': '/',              # the other file's root group
    }
    for link_name, target_path in external_targets.items():
        f[link_name] = h5py.ExternalLink('source.h5', target_path)

# Read through the external link.
with h5py.File('main.h5', 'r') as f:
    data = f['linked_data'][:]
    print(f"从外部文件读取的数据:{data[:10]}")

6.4 对象引用（Object References）

# Object references: stored pointers to other objects in the same file.
with h5py.File('example.h5', 'w') as f:
    # Create several datasets to point at.
    dset1 = f.create_dataset('dataset_1', data=np.arange(10))
    dset2 = f.create_dataset('dataset_2', data=np.arange(20))
    dset3 = f.create_dataset('dataset_3', data=np.arange(30))

    # Store their references as one dataset of reference type.
    ref_dtype = h5py.ref_dtype
    refs = np.array([d.ref for d in (dset1, dset2, dset3)], dtype=ref_dtype)
    f.create_dataset('dataset_references', data=refs)

# Dereference each stored reference through the file object.
with h5py.File('example.h5', 'r') as f:
    refs = f['dataset_references'][:]
    for i, ref in enumerate(refs):
        dset = f[ref]
        print(f"引用{i}指向:{dset.name}, 形状:{dset.shape}")
        data = dset[:]
        print(f" 数据:{data}")

6.5 区域引用（Region References）

# Region references: stored pointers to sub-selections of a dataset.
with h5py.File('example.h5', 'w') as f:
    # Create the dataset the regions refer to (100 rows x 10 columns).
    data = np.arange(1000).reshape(100, 10)
    dset = f.create_dataset('large_dataset', data=data)

    make_region = dset.regionref
    region1 = make_region[0:10, :]      # first 10 rows
    region2 = make_region[50:60, :]     # 10 rows from the middle
    region3 = make_region[:, 0:5]       # first 5 columns
    region4 = make_region[20:30, 3:7]   # rectangular sub-matrix

    # Store the region references in their own dataset.
    ref_dtype = h5py.regionref_dtype
    regions = np.array([region1, region2, region3, region4], dtype=ref_dtype)
    f.create_dataset('regions', data=regions)

# Use each region reference to read the selected data.
with h5py.File('example.h5', 'r') as f:
    dset = f['large_dataset']
    regions = f['regions'][:]
    for i, region in enumerate(regions):
        # Indexing the dataset with a region reference reads that region.
        region_data = dset[region]
        print(f"区域{i}: 形状 ={region_data.shape}")
        print(f" 数据样本:{region_data.ravel()[:5]}")

7. 压缩和分块

7.1 压缩方法对比

import time

# Create the test data: 10000 x 1000 float32 values.
test_data = np.random.randn(10000, 1000).astype('float32')

# Write the same data with each compression method and time each write.
# perf_counter is used because it is a monotonic clock meant for intervals.
with h5py.File('compression_test.h5', 'w') as f:
    # No compression
    start = time.perf_counter()
    f.create_dataset('no_compression', data=test_data)
    time_no_comp = time.perf_counter() - start

    # GZIP compression (levels 1-9)
    for level in [1, 4, 9]:
        start = time.perf_counter()
        f.create_dataset(
            f'gzip_level_{level}',
            data=test_data,
            compression='gzip',
            compression_opts=level,
        )
        time_gzip = time.perf_counter() - start
        print(f"GZIP级别{level}:{time_gzip:.2f}秒")

    # LZF compression
    start = time.perf_counter()
    f.create_dataset('lzf_compression', data=test_data, compression='lzf')
    time_lzf = time.perf_counter() - start
    print(f"LZF:{time_lzf:.2f}秒")

    # SZIP compression (needs an HDF5 build that includes the SZIP encoder).
    # Catch only library errors so Ctrl-C / SystemExit are not swallowed.
    try:
        f.create_dataset(
            'szip_compression',
            data=test_data,
            compression='szip',
            compression_opts=('nn', 16),
        )
    except (ValueError, RuntimeError, OSError):
        print("SZIP不可用")

# Compare on-disk size and read speed.
# `os` is imported here and used by later snippets in this guide.
import os

with h5py.File('compression_test.h5', 'r') as f:
    for name in f.keys():
        dset = f[name]

        # Read-speed test: read the whole dataset once.
        start = time.perf_counter()
        _ = dset[:]
        read_time = time.perf_counter() - start

        # Bytes this dataset occupies in the file.
        storage_size = dset.id.get_storage_size()

        print(f"{name}:")
        print(f" 存储大小:{storage_size/1024/1024:.2f}MB")
        print(f" 读取时间:{read_time:.3f}秒")

7.2 分块策略

# Write the same 10000x10000 array under several chunk layouts.
# Relies on `time` having been imported by an earlier snippet.
with h5py.File('chunking_test.h5', 'w') as f:
    data = np.random.randn(10000, 10000).astype('float32')

    # Let h5py pick the chunk shape.
    f.create_dataset('auto_chunks', data=data, chunks=True)

    chunk_layouts = [
        ('row_chunks', (100, 10000)),     # 100 rows per chunk (suits row traversal)
        ('col_chunks', (10000, 100)),     # 100 columns per chunk (suits column traversal)
        ('square_chunks', (1000, 1000)),  # 1000x1000 blocks (suits random access)
        ('small_chunks', (10, 10)),       # very small chunks
    ]
    for layout_name, chunk_shape in chunk_layouts:
        f.create_dataset(layout_name, data=data, chunks=chunk_shape)

# Time row-wise and column-wise reads for three of the layouts.
with h5py.File('chunking_test.h5', 'r') as f:
    for name in ['row_chunks', 'col_chunks', 'square_chunks']:
        dset = f[name]

        # Row access: read every 1000th row.
        start = time.time()
        for i in range(0, 10000, 1000):
            _ = dset[i, :]
        row_time = time.time() - start

        # Column access: read every 1000th column.
        start = time.time()
        for j in range(0, 10000, 1000):
            _ = dset[:, j]
        col_time = time.time() - start

        print(f"{name}:")
        print(f" 行访问:{row_time:.2f}秒")
        print(f" 列访问:{col_time:.2f}秒")

7.3 最优分块大小计算

def calculate_optimal_chunk_size(shape, dtype, target_chunk_size_mb=1):
    """Suggest a chunk shape holding roughly ``target_chunk_size_mb`` of data.

    Every dimension is shrunk by the same factor, so the chunk keeps the
    proportions of the dataset. Each chunk dimension is at least 1.

    Args:
        shape: Dataset shape (sequence of ints).
        dtype: Element data type; anything ``np.dtype`` accepts.
        target_chunk_size_mb: Target chunk size in MB.

    Returns:
        Tuple of ints giving the chunk shape. If the whole dataset already
        fits within the target, the dataset shape is returned, with any
        zero-length dimension raised to 1 so the chunk shape stays valid.
    """
    element_size = np.dtype(dtype).itemsize
    target_elements = (target_chunk_size_mb * 1024 * 1024) / element_size

    # Chunk dimensions must be positive, so a zero-length axis (for example a
    # resizable dataset that starts empty) is treated as length 1.
    dims = tuple(max(1, int(dim)) for dim in shape)

    # Multiply with Python ints: np.prod can silently overflow on platforms
    # whose default NumPy integer is 32-bit.
    total_elements = 1
    for dim in dims:
        total_elements *= dim

    if total_elements <= target_elements:
        return dims

    # Shrink every dimension by the same factor.
    scale = (target_elements / total_elements) ** (1 / len(dims))
    return tuple(max(1, int(dim * scale)) for dim in dims)


# Usage example
shape = (10000, 5000, 3)
dtype = np.float32
optimal_chunks = calculate_optimal_chunk_size(shape, dtype)
print(f"推荐的块大小:{optimal_chunks}")

with h5py.File('optimal_chunks.h5', 'w') as f:
    data = np.random.randn(*shape).astype(dtype)
    f.create_dataset(
        'data',
        data=data,
        chunks=optimal_chunks,
        compression='gzip',
        compression_opts=4,
    )

7.4 Shuffle过滤器

# Compare gzip compression with and without the shuffle filter.
with h5py.File('shuffle_test.h5', 'w') as f:
    # Correlated data: a ramp plus small noise.
    data = np.arange(100000, dtype='float32').reshape(1000, 100)
    data += np.random.randn(1000, 100) * 0.1

    # Same data and gzip level; only the shuffle flag differs.
    for ds_name, use_shuffle in (('without_shuffle', False), ('with_shuffle', True)):
        f.create_dataset(
            ds_name,
            data=data,
            compression='gzip',
            compression_opts=9,
            shuffle=use_shuffle,
        )

# Compare the stored sizes.
with h5py.File('shuffle_test.h5', 'r') as f:
    size_without = f['without_shuffle'].id.get_storage_size()
    size_with = f['with_shuffle'].id.get_storage_size()
    print(f"不使用shuffle:{size_without/1024:.2f}KB")
    print(f"使用shuffle:{size_with/1024:.2f}KB")
    print(f"压缩率提升:{(1-size_with/size_without)*100:.1f}%")

8. 可扩展数据集

8.1 一维可扩展数据集

# Create a one-dimensional dataset that can grow along its only axis.
with h5py.File('resizable.h5', 'w') as f:
    dset = f.create_dataset(
        'expandable_1d',
        shape=(100,),
        maxshape=(None,),  # None: no upper bound on this axis
        dtype='float32',
        chunks=(100,),
    )
    # Initial data
    dset[:] = np.random.randn(100)

# Reopen in append mode and add 50 more values.
with h5py.File('resizable.h5', 'a') as f:
    dset = f['expandable_1d']

    new_data = np.random.randn(50)
    old_size = dset.shape[0]

    # Grow the dataset, then write the new values into the added tail.
    dset.resize(old_size + 50, axis=0)
    dset[old_size:] = new_data
    print(f"新大小:{dset.shape}")

8.2 多维可扩展数据集

# Create a two-dimensional dataset that can grow along the first axis only.
with h5py.File('resizable.h5', 'w') as f:
    dset = f.create_dataset(
        'expandable_2d',
        shape=(100, 50),
        maxshape=(None, 50),  # rows unbounded, columns fixed at 50
        dtype='float32',
        chunks=(10, 50),
    )
    dset[:] = np.random.randn(100, 50)

# Append rows.
with h5py.File('resizable.h5', 'a') as f:
    dset = f['expandable_2d']
    new_rows = 20
    old_rows = dset.shape[0]
    dset.resize(old_rows + new_rows, axis=0)
    dset[old_rows:, :] = np.random.randn(new_rows, 50)

8.3 流式数据写入

def stream_data_writer(filename, chunk_size=1000):
    """Simulate streaming writes by appending ten random batches to a dataset.

    Args:
        filename: Path of the HDF5 file to create (opened in 'w' mode).
        chunk_size: Number of rows per appended batch; also used as the
            dataset's chunk height.
    """
    num_batches = 10
    with h5py.File(filename, 'w') as f:
        # Start empty and allow unlimited growth along the first axis.
        dset = f.create_dataset(
            'streaming_data',
            shape=(0, 100),
            maxshape=(None, 100),
            chunks=(chunk_size, 100),
            dtype='float32',
        )

        # Simulate a continuous data stream.
        for i in range(num_batches):
            new_data = np.random.randn(chunk_size, 100)

            # Extend the dataset, then write the batch into the new rows.
            old_size = dset.shape[0]
            dset.resize(old_size + chunk_size, axis=0)
            dset[old_size:, :] = new_data
            print(f"批次{i+1}: 累计大小 ={dset.shape}")


# Usage
stream_data_writer('streaming.h5')

8.4 时间序列数据追加

from datetime import datetime, timedelta

# Append timestamped samples one at a time to two parallel resizable datasets.
with h5py.File('timeseries.h5', 'w') as f:
    # Timestamps are stored as variable-length UTF-8 strings.
    dt_type = h5py.string_dtype(encoding='utf-8')
    timestamps = f.create_dataset(
        'timestamps',
        shape=(0,),
        maxshape=(None,),
        dtype=dt_type,
        chunks=(1000,),
    )

    # Numeric values: 10 columns per sample.
    values = f.create_dataset(
        'values',
        shape=(0, 10),
        maxshape=(None, 10),
        chunks=(1000, 10),
        dtype='float32',
    )

    start_time = datetime.now()
    for i in range(5):
        # New timestamp, one second after the previous one.
        current_time = start_time + timedelta(seconds=i)
        timestamp_str = current_time.isoformat()

        # New data row.
        new_value = np.random.randn(1, 10)

        # Grow both datasets by one entry and fill the new slot.
        old_size = values.shape[0]
        timestamps.resize(old_size + 1, axis=0)
        values.resize(old_size + 1, axis=0)
        timestamps[old_size] = timestamp_str
        values[old_size, :] = new_value

# Read the time series back.
with h5py.File('timeseries.h5', 'r') as f:
    ts = f['timestamps'][:]
    vals = f['values'][:]
    for t, v in zip(ts[:5], vals[:5]):
        print(f"{t}:{v}")

9. 维度标签

9.1 创建维度标签

# Label the axes of a dataset and attach a coordinate dataset to each axis.
with h5py.File('dimensions.h5', 'w') as f:
    # The main dataset: (time, y, x, channel).
    data = np.random.randn(100, 64, 64, 3)
    dset = f.create_dataset('video', data=data)

    # Dimension 0: time
    time_scale = f.create_dataset('time', data=np.arange(100))
    time_scale.attrs['units'] = 'frames'

    # Dimensions 1 and 2: spatial coordinates
    y_coords = f.create_dataset('y_coords', data=np.arange(64))
    x_coords = f.create_dataset('x_coords', data=np.arange(64))

    # Dimension 3: colour channels
    channels = f.create_dataset('channels', data=[b'R', b'G', b'B'])

    # Set the label of every axis.
    for axis, axis_label in enumerate(('time', 'y', 'x', 'channel')):
        dset.dims[axis].label = axis_label

    # Attach one dimension scale to every axis.
    for axis, scale_dset in enumerate((time_scale, y_coords, x_coords, channels)):
        dset.dims[axis].attach_scale(scale_dset)

# Read the dimension information back.
with h5py.File('dimensions.h5', 'r') as f:
    dset = f['video']
    print("维度信息:")
    for i, dim in enumerate(dset.dims):
        print(f" 维度{i}:{dim.label}")

        # Show the first attached scale, if the axis has one.
        if len(dim) > 0:
            scale = dim[0]
            print(f" 标度:{scale.name}")
            print(f" 值:{scale[:5]}...")  # at most the first five values

9.2 多个维度标度

# Attach several alternative scales to the same axis of a dataset.
# Relies on `datetime` and `timedelta` imported by an earlier snippet.
with h5py.File('multi_scale.h5', 'w') as f:
    data = np.random.randn(1000, 100)
    dset = f.create_dataset('measurements', data=data)

    # Scale 1 for the first axis: sample index.
    indices = f.create_dataset('sample_indices', data=np.arange(1000))

    # Scale 2: time in seconds (0.001 s per sample).
    time_seconds = f.create_dataset('time_seconds', data=np.arange(1000) * 0.001)

    # Scale 3: ISO-8601 timestamps, one millisecond apart.
    dt_type = h5py.string_dtype(encoding='utf-8')
    start_time = datetime(2024, 1, 1, 0, 0, 0)
    timestamps = [
        (start_time + timedelta(milliseconds=i)).isoformat()
        for i in range(1000)
    ]
    f.create_dataset('timestamps', data=timestamps, dtype=dt_type)

    # Attach all three scales to axis 0.
    first_axis = dset.dims[0]
    first_axis.attach_scale(indices)
    first_axis.attach_scale(time_seconds)
    first_axis.attach_scale(f['timestamps'])

    # One scale for the second axis: channel names.
    channel_names = [f'Channel_{i}'.encode() for i in range(100)]
    channels = f.create_dataset('channel_names', data=channel_names)
    dset.dims[1].attach_scale(channels)

# List every scale attached to the first axis.
with h5py.File('multi_scale.h5', 'r') as f:
    dset = f['measurements']
    print("第一维的标度:")
    for scale in dset.dims[0]:
        print(f"{scale.name}:{scale[:3]}...")

10. 完整示例代码

10.1 创建综合示例文件

"""Create a comprehensive example file exercising the HDF5 features above."""
import h5py
import numpy as np
from datetime import datetime, timedelta


def create_comprehensive_h5(filename='comprehensive.h5'):
    """Write a demonstration HDF5 file covering twelve feature areas.

    Args:
        filename: Path of the file to create. It is opened in 'w' mode, so
            an existing file at that path is overwritten.
    """
    with h5py.File(filename, 'w') as f:
        # ========================================
        # 1. File-level metadata
        # ========================================
        f.attrs['title'] = 'Comprehensive HDF5 Example'
        f.attrs['author'] = 'Josh Witt'
        f.attrs['created'] = datetime.now().isoformat()
        f.attrs['version'] = '1.0'
        f.attrs['description'] = 'Contains all HDF5 data types and features'

        # ========================================
        # 2. Basic numeric data
        # ========================================
        basic_group = f.create_group('basic_types')

        # Assorted numeric types
        basic_group.create_dataset('int32', data=np.arange(100, dtype='i4'))
        basic_group.create_dataset('float64', data=np.random.randn(100))
        basic_group.create_dataset(
            'complex128',
            data=np.random.randn(50) + 1j * np.random.randn(50),
        )
        basic_group.create_dataset('bool', data=np.random.rand(100) > 0.5)

        # Multi-dimensional arrays
        basic_group.create_dataset('matrix_2d', data=np.random.randn(100, 50))
        basic_group.create_dataset('tensor_3d', data=np.random.randn(10, 20, 30))
        basic_group.create_dataset('tensor_4d', data=np.random.randn(5, 10, 20, 3))

        # ========================================
        # 3. String data
        # ========================================
        string_group = f.create_group('strings')

        # Fixed length
        string_group.create_dataset(
            'fixed_ascii',
            data=np.array([b'hello', b'world'], dtype='S10'),
        )

        # Variable length
        vlen_str = h5py.string_dtype(encoding='utf-8')
        string_group.create_dataset(
            'variable_utf8',
            data=['短', '这是一个很长的字符串', '中'],
            dtype=vlen_str,
        )

        # ========================================
        # 4. Compound data types
        # ========================================
        compound_group = f.create_group('compound_types')

        # Simple struct
        person_dt = np.dtype([
            ('name', 'S50'),
            ('age', 'i4'),
            ('salary', 'f8'),
        ])
        person_data = np.array([
            (b'Alice', 25, 75000.0),
            (b'Bob', 30, 85000.0),
            (b'Charlie', 28, 80000.0),
        ], dtype=person_dt)
        compound_group.create_dataset('people', data=person_data)

        # Struct with an embedded fixed-length array field
        nested_dt = np.dtype([
            ('id', 'i4'),
            ('measurements', 'f4', (5,)),  # fixed-length array
            ('valid', '?'),
        ])
        nested_data = np.array([
            (1, [1.1, 2.2, 3.3, 4.4, 5.5], True),
            (2, [6.6, 7.7, 8.8, 9.9, 10.0], False),
        ], dtype=nested_dt)
        compound_group.create_dataset('nested', data=nested_data)

        # ========================================
        # 5. Compressed data
        # ========================================
        compression_group = f.create_group('compressed')
        test_data = np.random.randn(1000, 1000).astype('float32')

        # Different compression methods
        compression_group.create_dataset(
            'gzip_level_1', data=test_data,
            compression='gzip', compression_opts=1,
        )
        compression_group.create_dataset(
            'gzip_level_9', data=test_data,
            compression='gzip', compression_opts=9, shuffle=True,
        )
        compression_group.create_dataset('lzf', data=test_data, compression='lzf')

        # ========================================
        # 6. Resizable datasets
        # ========================================
        expandable_group = f.create_group('expandable')

        # 1-D resizable
        exp_1d = expandable_group.create_dataset(
            'data_1d', shape=(100,), maxshape=(None,),
            chunks=(100,), dtype='float32',
        )
        exp_1d[:] = np.random.randn(100)

        # 2-D resizable (first axis only)
        exp_2d = expandable_group.create_dataset(
            'data_2d', shape=(100, 50), maxshape=(None, 50),
            chunks=(100, 50), dtype='float32',
        )
        exp_2d[:] = np.random.randn(100, 50)

        # ========================================
        # 7. References and links
        # ========================================
        reference_group = f.create_group('references')

        # The data being referred to
        target_data = np.arange(100)
        target = f.create_dataset('target_dataset', data=target_data)

        # Soft link
        reference_group['soft_link'] = h5py.SoftLink('/target_dataset')

        # Object reference
        ref = target.ref
        reference_group.create_dataset('object_ref', data=ref)

        # ========================================
        # 8. Attribute example
        # ========================================
        attr_group = f.create_group('attributes_example')

        # A dataset with rich attributes
        sensor_data = np.random.randn(1000, 10)
        sensor_dset = attr_group.create_dataset('sensor_readings', data=sensor_data)

        # Attributes of several value types
        sensor_dset.attrs['units'] = 'meters/second'
        sensor_dset.attrs['sampling_rate'] = 1000.0
        sensor_dset.attrs['calibrated'] = True
        sensor_dset.attrs['sensor_ids'] = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
        sensor_dset.attrs['date_collected'] = datetime.now().isoformat()
        sensor_dset.attrs['location'] = 'Lab Building A, Room 101'
        sensor_dset.attrs['temperature'] = 23.5
        sensor_dset.attrs['humidity'] = 45.2

        # ========================================
        # 9. Dimension labels
        # ========================================
        dims_group = f.create_group('with_dimensions')

        # 3-D data with dimension labels
        volume_data = np.random.randn(50, 100, 100)
        volume = dims_group.create_dataset('volume', data=volume_data)

        # Dimension scales
        z_coords = dims_group.create_dataset('z', data=np.arange(50))
        y_coords = dims_group.create_dataset('y', data=np.arange(100))
        x_coords = dims_group.create_dataset('x', data=np.arange(100))

        # Label the axes and attach the scales
        volume.dims[0].label = 'z'
        volume.dims[1].label = 'y'
        volume.dims[2].label = 'x'
        volume.dims[0].attach_scale(z_coords)
        volume.dims[1].attach_scale(y_coords)
        volume.dims[2].attach_scale(x_coords)

        # ========================================
        # 10. Practical example: neural-network weights
        # ========================================
        nn_group = f.create_group('neural_network')

        # Simulated network layers
        layers = {
            'layer1': {'weights': np.random.randn(784, 128), 'biases': np.zeros(128)},
            'layer2': {'weights': np.random.randn(128, 64), 'biases': np.zeros(64)},
            'layer3': {'weights': np.random.randn(64, 10), 'biases': np.zeros(10)},
        }
        for layer_name, params in layers.items():
            layer_group = nn_group.create_group(layer_name)
            for param_name, param_value in params.items():
                dset = layer_group.create_dataset(param_name, data=param_value)
                dset.attrs['trainable'] = True
                dset.attrs['dtype'] = str(param_value.dtype)

        nn_group.attrs['architecture'] = 'feedforward'
        nn_group.attrs['input_size'] = 784
        nn_group.attrs['output_size'] = 10

        # ========================================
        # 11. Time-series data
        # ========================================
        ts_group = f.create_group('time_series')

        # Generate timestamps one second apart.
        num_samples = 1000
        start_time = datetime(2024, 1, 1, 0, 0, 0)
        timestamps = [
            (start_time + timedelta(seconds=i)).isoformat()
            for i in range(num_samples)
        ]

        # Store the timestamps
        dt_type = h5py.string_dtype(encoding='utf-8')
        ts_group.create_dataset('timestamps', data=timestamps, dtype=dt_type)

        # Store the matching values
        ts_group.create_dataset('values', data=np.random.randn(num_samples, 5))

        # ========================================
        # 12. Large dataset with chunking and compression
        # ========================================
        large_group = f.create_group('large_data')
        large_data = np.random.randn(10000, 1000).astype('float32')
        large_dset = large_group.create_dataset(
            'optimized',
            data=large_data,
            chunks=(1000, 100),
            compression='gzip',
            compression_opts=4,
            shuffle=True,
        )
        large_dset.attrs['chunk_strategy'] = 'optimized for row access'

        # Flush first so the reported storage size is intended to include
        # chunks still buffered in memory; guard against a zero size so the
        # ratio can never raise ZeroDivisionError.
        f.flush()
        stored_bytes = large_dset.id.get_storage_size()
        ratio = large_data.nbytes / stored_bytes if stored_bytes > 0 else 1.0
        large_dset.attrs['compression_ratio'] = f'{ratio:.2f}x'

    print(f"文件 '{filename}' 创建完成！")


# Create the file
create_comprehensive_h5()

10.2 读取和分析综合示例文件

"""Read and analyse the comprehensive example file."""


def analyze_h5_file(filename='comprehensive.h5'):
    """Print attributes, structure, statistics and sample data of an HDF5 file.

    Args:
        filename: Path of the HDF5 file to analyse (opened read-only).
    """
    # Imported locally so this example runs on its own; the surrounding
    # example imports only h5py, numpy and datetime.
    import os

    print(f"分析文件:{filename}")
    print("=" * 80)

    with h5py.File(filename, 'r') as f:
        # File attributes
        print("\n【文件属性】")
        for key, value in f.attrs.items():
            print(f"{key}:{value}")

        # Structure, printed recursively
        print("\n【文件结构】")

        def print_tree(name, obj, level=0):
            """Print one group or dataset, indented by *level*."""
            indent = " " * level
            if isinstance(obj, h5py.Group):
                print(f"{indent}📁{name}/")
                # Group attributes
                if len(obj.attrs) > 0:
                    for key in obj.attrs.keys():
                        print(f"{indent}@{key}:{obj.attrs[key]}")
            elif isinstance(obj, h5py.Dataset):
                size_mb = obj.nbytes / (1024 * 1024)
                storage_mb = obj.id.get_storage_size() / (1024 * 1024)
                compression = obj.compression or 'none'
                print(f"{indent}📄{name}")
                print(f"{indent}形状:{obj.shape}, 类型:{obj.dtype}")
                print(f"{indent}大小:{size_mb:.2f}MB, 存储:{storage_mb:.2f}MB")
                print(f"{indent}压缩:{compression}")
                if obj.chunks:
                    print(f"{indent}分块:{obj.chunks}")
                # Dataset attributes
                if len(obj.attrs) > 0:
                    print(f"{indent}属性:")
                    for key, value in obj.attrs.items():
                        print(f"{indent}{key}:{value}")

        # The depth of an object is the number of '/' separators in its path.
        f.visititems(lambda n, o: print_tree(n, o, n.count('/')))

        # Statistics
        print("\n【统计信息】")

        def count_items(group):
            """Return (group count, dataset count, total dataset bytes)."""
            groups = 0
            datasets = 0
            total_size = 0

            def count(name, obj):
                nonlocal groups, datasets, total_size
                if isinstance(obj, h5py.Group):
                    groups += 1
                elif isinstance(obj, h5py.Dataset):
                    datasets += 1
                    total_size += obj.nbytes

            group.visititems(count)
            return groups, datasets, total_size

        num_groups, num_datasets, total_size = count_items(f)
        file_size = os.path.getsize(filename)
        # Guard the ratio so a zero file size cannot raise ZeroDivisionError.
        overall_ratio = total_size / file_size if file_size > 0 else 1.0
        print(f" 组数量:{num_groups}")
        print(f" 数据集数量:{num_datasets}")
        print(f" 原始数据大小:{total_size/(1024*1024):.2f}MB")
        print(f" 文件大小:{file_size/(1024*1024):.2f}MB")
        print(f" 总压缩率:{overall_ratio:.2f}x")

        # Example: reading specific data
        print("\n【示例数据读取】")

        # Basic type
        if 'basic_types/float64' in f:
            data = f['basic_types/float64'][:10]
            print(f" float64前10个值:{data}")

        # Compound records
        if 'compound_types/people' in f:
            people = f['compound_types/people'][:]
            print(f" 人员记录:")
            for person in people:
                print(f"{person['name'].decode()}: 年龄{person['age']}, 工资${person['salary']}")

        # Neural-network weights
        if 'neural_network' in f:
            print(f" 神经网络架构:{f['neural_network'].attrs['architecture']}")
            print(f" 层:")
            for layer_name in f['neural_network'].keys():
                layer = f['neural_network'][layer_name]
                weights_shape = layer['weights'].shape
                print(f"{layer_name}:{weights_shape}")


# Run the analysis
if __name__ == '__main__':
    create_comprehensive_h5()
    analyze_h5_file()

10.3 实用工具函数集

"""HDF5 utility function collection."""


class HDF5Utils:
    """Stateless helpers for inspecting, copying and validating HDF5 files.

    Every method is a ``@staticmethod`` that opens the file itself, so the
    class only serves as a namespace.
    """

    @staticmethod
    def get_file_info(filename):
        """Collect basic information about a file.

        Args:
            filename: Path of the HDF5 file to open read-only.

        Returns:
            A dict with the keys ``filename``, ``file_size_mb``,
            ``num_groups``, ``num_datasets``, ``total_data_size_mb`` and
            ``attributes`` (the file-level attributes as a plain dict).
        """
        with h5py.File(filename, 'r') as f:
            info = {
                'filename': filename,
                'file_size_mb': os.path.getsize(filename) / (1024 * 1024),
                'num_groups': 0,
                'num_datasets': 0,
                'total_data_size_mb': 0,
            }

            def count(name, obj):
                if isinstance(obj, h5py.Group):
                    info['num_groups'] += 1
                elif isinstance(obj, h5py.Dataset):
                    info['num_datasets'] += 1
                    info['total_data_size_mb'] += obj.nbytes / (1024 * 1024)

            f.visititems(count)

            # File attributes
            info['attributes'] = dict(f.attrs)
        return info

    @staticmethod
    def find_large_datasets(filename, threshold_mb=10):
        """Find datasets whose in-memory size exceeds a threshold.

        Args:
            filename: Path of the HDF5 file to open read-only.
            threshold_mb: Size limit in MiB; only strictly larger datasets
                are reported.

        Returns:
            A list of dicts (``name``, ``size_mb``, ``shape``, ``dtype``),
            sorted by ``size_mb`` in descending order.
        """
        large_datasets = []
        with h5py.File(filename, 'r') as f:

            def check_size(name, obj):
                if isinstance(obj, h5py.Dataset):
                    size_mb = obj.nbytes / (1024 * 1024)
                    if size_mb > threshold_mb:
                        large_datasets.append({
                            'name': name,
                            'size_mb': size_mb,
                            'shape': obj.shape,
                            'dtype': str(obj.dtype),
                        })

            f.visititems(check_size)
        return sorted(large_datasets, key=lambda x: x['size_mb'], reverse=True)

    @staticmethod
    def copy_dataset(src_file, src_path, dst_file, dst_path=None):
        """Copy an object from one file into another.

        Args:
            src_file: Path of the source file (opened read-only).
            src_path: Path of the object inside the source file.
            dst_file: Path of the destination file (opened in append mode).
            dst_path: Path inside the destination file; defaults to
                ``src_path``.
        """
        if dst_path is None:
            dst_path = src_path
        with h5py.File(src_file, 'r') as f_src, h5py.File(dst_file, 'a') as f_dst:
            f_src.copy(src_path, f_dst, name=dst_path)

    @staticmethod
    def export_to_dict(filename, path='/'):
        """Export an HDF5 hierarchy as a nested dictionary.

        Groups become nested dicts and datasets become their full contents.

        Args:
            filename: Path of the HDF5 file to open read-only.
            path: Object to start from. With the default ``'/'`` the whole
                file is exported. The original implementation accepted this
                parameter but ignored it.

        Returns:
            A dict keyed by names relative to ``path``. If ``path`` names a
            dataset, the dict holds a single entry keyed by the dataset's
            own name.
        """
        result = {}
        with h5py.File(filename, 'r') as f:
            root = f[path]

            if isinstance(root, h5py.Dataset):
                # A dataset has no children to visit; export it on its own.
                result[root.name.rsplit('/', 1)[-1]] = root[()]
                return result

            def build_dict(name, obj):
                parts = name.split('/')
                current = result
                # Create (or descend into) every intermediate group dict.
                for part in parts[:-1]:
                    current = current.setdefault(part, {})

                if isinstance(obj, h5py.Dataset):
                    # ``obj[()]`` reads the whole dataset and, unlike
                    # ``obj[:]``, also works for scalar datasets.
                    current[parts[-1]] = obj[()]
                elif isinstance(obj, h5py.Group):
                    current.setdefault(parts[-1], {})

            root.visititems(build_dict)
        return result

    @staticmethod
    def get_compression_stats(filename):
        """Collect per-dataset compression statistics.

        Args:
            filename: Path of the HDF5 file to open read-only.

        Returns:
            A list of dicts (``name``, ``compression``, ``original_mb``,
            ``storage_mb``, ``ratio``). ``ratio`` is ``1.0`` when the dataset
            occupies no storage, which avoids a division by zero.
        """
        stats = []
        with h5py.File(filename, 'r') as f:

            def analyze_compression(name, obj):
                if isinstance(obj, h5py.Dataset):
                    original_size = obj.nbytes
                    storage_size = obj.id.get_storage_size()
                    stats.append({
                        'name': name,
                        'compression': obj.compression or 'none',
                        'original_mb': original_size / (1024 * 1024),
                        'storage_mb': storage_size / (1024 * 1024),
                        'ratio': original_size / storage_size if storage_size > 0 else 1.0,
                    })

            f.visititems(analyze_compression)
        return stats

    @staticmethod
    def validate_file(filename):
        """Check that a file opens and that every dataset is readable.

        One element is read from each non-empty dataset as a cheap
        readability probe.

        Args:
            filename: Path of the HDF5 file to open read-only.

        Returns:
            A ``(ok, messages)`` tuple. ``ok`` is ``True`` when no dataset
            failed; ``messages`` then holds a single success message,
            otherwise one entry per failing dataset (or one entry describing
            why the file could not be processed).
        """
        try:
            with h5py.File(filename, 'r') as f:
                # Try to access every dataset
                errors = []

                def validate_dataset(name, obj):
                    if isinstance(obj, h5py.Dataset):
                        try:
                            # ``size`` is None for datasets with an empty
                            # dataspace, so test truthiness rather than ``> 0``.
                            if obj.size:
                                # h5py datasets have no NumPy-style ``flat``;
                                # index the first element explicitly. For a
                                # scalar dataset this is ``obj[()]``.
                                _ = obj[(0,) * obj.ndim]
                        except Exception as e:
                            # Deliberately broad: any read failure counts as
                            # a validation error for this dataset.
                            errors.append(f"{name}:{str(e)}")

                f.visititems(validate_dataset)

                if errors:
                    return False, errors
                return True, ["文件验证通过"]
        except Exception as e:
            return False, [f"无法打开文件:{str(e)}"]


# Usage example
if __name__ == '__main__':
    utils = HDF5Utils()

    # File information
    info = utils.get_file_info('comprehensive.h5')
    print("文件信息:", info)

    # Large datasets
    large = utils.find_large_datasets('comprehensive.h5', threshold_mb=1)
    print("\n大数据集:", large)

    # Compression statistics
    comp_stats = utils.get_compression_stats('comprehensive.h5')
    print("\n压缩统计:")
    for stat in comp_stats[:5]:  # show only the first five
        print(f"{stat['name']}:{stat['ratio']:.2f}x ({stat['compression']})")

    # Validate the file
    valid, messages = utils.validate_file('comprehensive.h5')
    print(f"\n文件验证:{'通过' if valid else '失败'}")
    for msg in messages:
        print(f"{msg}")

总结

这份指南涵盖了HDF5的所有主要特性：

基础数据类型 - 数值、字符串、布尔等
组结构 - 层次化组织数据
属性系统 - 元数据管理
高级数据类型 - 复合类型、枚举、变长数据
引用和链接 - 软链接、硬链接、对象引用
压缩和分块 - 优化存储和访问
可扩展数据集 - 动态增长的数据
维度标签 - 为数据添加物理意义
完整示例 - 实际应用代码

HDF5完整文件结构与操作指南

目录

1. 完整文件结构概览

1.1 理想的HDF5文件结构

2. 基础数据集类型

2.1 数值类型数据集

2.1.1 整数类型

2.1.2 浮点类型

2.1.3 复数类型

2.1.4 布尔类型

2.2 字符串类型数据集

2.2.1 固定长度字符串

2.2.2 变长字符串

2.3 多维数组

2.3.1 一维数组（向量）

2.3.2 二维数组（矩阵）

2.3.3 三维数组（体数据）

2.3.4 四维及更高维数组

3. 组结构操作

3.1 创建和组织组

3.2 遍历组结构

3.3 移动、复制和删除组

3.4 按条件查找数据集

4. 属性系统

4.1 文件级属性

4.2 组级属性

4.3 数据集级属性

4.4 修改和删除属性

5. 高级数据类型

5.1 复合数据类型（结构体）

5.2 嵌套复合类型

5.3 枚举类型

5.4 变长数据类型

6. 引用和链接

6.1 软链接（Soft Links）

6.2 硬链接（Hard Links）

6.3 外部链接（External Links）

6.4 对象引用（Object References）

6.5 区域引用（Region References）

7. 压缩和分块

7.1 压缩方法对比

7.2 分块策略

7.3 最优分块大小计算

7.4 Shuffle过滤器

8. 可扩展数据集

8.1 一维可扩展数据集

8.2 多维可扩展数据集

8.3 流式数据写入

8.4 时间序列数据追加

9. 维度标签

9.1 创建维度标签

9.2 多个维度标度

10. 完整示例代码

10.1 创建综合示例文件

10.2 读取和分析综合示例文件

10.3 实用工具函数集

总结

鼎捷 易飞ERP 9.3 ISO安装包 下载

【Java毕设源码分享】基于springboot+vue的高校社团管理系统设计与实现(程序+文档+代码讲解+一条龙定制)

LLM应用实践: NoteBookLM初次使用

AWK：一行之诗，一门哲学，一种被遗忘的数据处理圣经

spdlog 库下载，编译，并使用的例子

Flutter与DevEco混合开发：跨端状态同步简易指南

鼎捷易飞ERP 9.3 ISO安装包下载