manifest list当中的每一条记录都是一个manifest文件的元数据,描述了该manifest文件的一些属性和分区信息。
因此只需要看manifest元数据有哪些信息,以及如何解析即可。
// Reads the manifest list through the Avro reader builder and returns its entries as a list.
// The try-with-resources statement closes the CloseableIterable when the block exits.
try (CloseableIterable<ManifestFile> files =
Avro.read(manifestList)
// NOTE(review): presumably maps the Avro record named "manifest_file" onto the
// GenericManifestFile class by name — confirm against the Avro read builder.
.rename("manifest_file", GenericManifestFile.class.getName())
// Both "partitions" and "r508" are mapped to GenericPartitionFieldSummary.
// NOTE(review): "r508" looks like a generated record name for the partitions element — confirm.
.rename("partitions", GenericPartitionFieldSummary.class.getName())
.rename("r508", GenericPartitionFieldSummary.class.getName())
.classLoader(GenericManifestFile.class.getClassLoader())
// Project onto the schema returned by ManifestFile.schema().
.project(ManifestFile.schema())
// NOTE(review): container reuse is turned off, presumably so every entry kept in the
// returned list is a distinct object — confirm.
.reuseContainers(false)
.build()) {
// Copy the entries into a list before the iterable is closed.
return Lists.newLinkedList(files);
}
manifest list文件当中存储的是manifest_file这个字段,对应的Java类是GenericManifestFile。至于schema,则是ManifestFile.schema()返回的schema。
// data fields
// NOTE(review): presumably the InputFile handle of the manifest itself; starts as null — confirm.
private InputFile file = null;
// manifest_path: path of the manifest file
private String manifestPath = null;
// Boxed Long, starts as null.
private Long length = null;
// Starts at -1.
private int specId = -1;
// Starts as ManifestContent.DATA.
private ManifestContent content = ManifestContent.DATA;
// Both sequence numbers start at 0.
private long sequenceNumber = 0;
private long minSequenceNumber = 0;
private Long snapshotId = null;
// Added / existing / deleted file counts; boxed and null until assigned.
private Integer addedFilesCount = null;
private Integer existingFilesCount = null;
private Integer deletedFilesCount = null;
// Added / existing / deleted row counts; boxed and null until assigned.
private Long addedRowsCount = null;
private Long existingRowsCount = null;
private Long deletedRowsCount = null;
// NOTE(review): presumably one summary per partition field of the manifest — confirm.
private PartitionFieldSummary[] partitions = null;
private byte[] keyMetadata = null;