Big update from the ruby-ole trunk

git-svn-id: file:///home/svn/framework3/trunk@6522 4d416f70-5f16-0410-b530-b9f4589650da
This commit is contained in:
HD Moore 2009-05-05 20:31:41 +00:00
parent 4bafe57fe3
commit 43a03aa307
11 changed files with 2368 additions and 1150 deletions

View File

@ -1,5 +1,7 @@
# NOTE(review): this is the pre-refactor form shown by the diff - the require
# is issued from inside the module body; the replacement hunk hoists it to
# file scope before the module is opened.
module Ole # :nodoc:
require 'ole/support'
Log = Logger.new_with_callstack
end
# ole/support must be loaded first: it provides Logger.new_with_callstack,
# used immediately below.
require 'ole/support'
module Ole # :nodoc:
# shared logger instance for the whole Ole module
Log = Logger.new_with_callstack
end

View File

@ -1,169 +1,2 @@
=begin
full file_system module
will be available and recommended usage, allowing Ole::Storage, Dir, and Zip::ZipFile to be
used pretty interchangeably down the track. it should be possible to write a recursive copy using
the plain api, such that you can copy dirs/files agnostically between any of ole docs, dirs,
and zip files.
i think it's okay to have an api like this on top, but there are certain things that ole
does that aren't captured.
ole::storage can have multiple files with the same name, for example, or with / in the
name, and other things that are probably invalid anyway.
i think this should remain an addon, built on top of my core api.
but still the ideas can be reflected in the core, ie, changing the read/write semantics.
once the core changes are complete, this will be a pretty straight forward file to complete.
=end
module Ole
  class Storage
    # Returns a File-like proxy (FileParent) over the streams in this document.
    def file
      @file ||= FileParent.new self
    end
    # Returns a Dir-like proxy (DirParent) over the directories in this document.
    def dir
      @dir ||= DirParent.new self
    end
    # Resolve +path_str+ (eg "dir1/dir2/file") to its Dirent. Leading and
    # trailing slashes are ignored; the empty path is the root. Raises if a
    # component is missing, or a non-final component is a file.
    def dirent_from_path path_str
      path = path_str.sub(/^\/*/, '').sub(/\/*$/, '')
      dirent = @root
      return dirent if path.empty?
      path = path.split /\/+/
      until path.empty?
        raise "invalid path #{path_str.inspect}" if dirent.file?
        if tmp = dirent[path.shift]
          dirent = tmp
        else
          # allow write etc later.
          raise "invalid path #{path_str.inspect}"
        end
      end
      dirent
    end
    # Proxy for a subset of the ::File class-method interface, operating on
    # the streams inside an ole document.
    class FileParent
      def initialize ole
        @ole = ole
      end
      # Open the stream at +path_str+; yields the io if a block is given,
      # otherwise returns it.
      def open path_str, mode='r'
        dirent = @ole.dirent_from_path path_str
        # like Errno::EISDIR
        raise "#{path_str.inspect} is a directory" unless dirent.file?
        io = dirent.io
        if block_given?
          yield io
        else
          io
        end
      end
      alias new :open
      # slurp the whole stream at +path+
      def read path
        open(path) { |f| f.read }
      end
      # crappy copy from Dir.
      def unlink path
        dirent = @ole.dirent_from_path path
        # EPERM
        raise "operation not permitted #{path.inspect}" unless dirent.file?
        # i think we should free all of our blocks. i think the best way to do that would be
        # like:
        # open(path) { |f| f.truncate 0 }. which should free all our blocks from the
        # allocation table. then if we remove ourself from our parent, we won't be part of
        # the bat at save time.
        # i think if you run repack, all free blocks should get zeroed.
        parent = @ole.dirent_from_path(('/' + path).sub(/\/[^\/]+$/, ''))
        parent.children.delete dirent
        1 # hmmm. as per ::File ?
      end
    end
    # Proxy for a subset of the ::Dir class-method interface.
    class DirParent
      def initialize ole
        @ole = ole
      end
      # Open the directory at +path_str+; yields the Dir if a block is given,
      # otherwise returns it.
      def open path_str
        dirent = @ole.dirent_from_path path_str
        # like Errno::ENOTDIR
        raise "#{path_str.inspect} is not a directory" unless dirent.dir?
        dir = Dir.new dirent, path_str
        if block_given?
          yield dir
        else
          dir
        end
      end
      # certain Dir class methods proxy in this fashion:
      def entries path
        open(path) { |dir| dir.entries }
      end
      # there are some other important ones, like:
      # chroot (!), mkdir, chdir, rmdir, glob etc etc. for now, i think
      # mkdir, and rmdir are the main ones we'd need to support
      def rmdir path
        dirent = @ole.dirent_from_path path
        # repeating myself
        raise "#{path.inspect} is not a directory" unless dirent.dir?
        # ENOTEMPTY:
        raise "directory not empty #{path.inspect}" unless dirent.children.empty?
        # now delete it, how to do that? the canonical representation that is
        # maintained is the root tree, and the children array. we must remove it
        # from the children array.
        # we need the parent then.
        # FIX: this previously read `path.sub(/\/[^\/]+$/, '') || '/'`. the `||`
        # bound to the result of +sub+, which is never nil, so the '/' fallback
        # was dead code, and a top-level path like "foo" (nothing to strip)
        # resolved to the dirent itself rather than to its parent. compute the
        # parent path the same way FileParent#unlink does.
        parent = @ole.dirent_from_path(('/' + path).sub(/\/[^\/]+$/, ''))
        # note that the way this currently works, on save and repack time this will get
        # reflected. to work properly, ie to make a difference now it would have to re-write
        # the dirent. i think that Ole::Storage#close will handle that. and maybe include a
        # #repack.
        parent.children.delete dirent
        0 # hmmm. as per ::Dir ?
      end
      # An object representing an open ole directory, with a subset of the
      # ::Dir instance interface (each, read, seek/tell, rewind).
      class Dir
        include Enumerable
        attr_reader :dirent, :path, :entries, :pos
        def initialize dirent, path
          @dirent, @path = dirent, path
          @pos = 0
          # FIXME: hack, and probably not really desired
          @entries = %w[. ..] + @dirent.children.map(&:name)
        end
        def each(&block)
          @entries.each(&block)
        end
        def close
        end
        # return the entry at the current position, then advance (nil at end)
        def read
          @entries[@pos]
        ensure
          @pos += 1 if @pos < @entries.length
        end
        # clamp the new position into 0..entries.length
        def pos= pos
          @pos = [[0, pos].max, @entries.length].min
        end
        def rewind
          @pos = 0
        end
        alias tell :pos
        alias seek :pos=
      end
    end
  end
end
# keeping this file around for now, but will delete later on...
require 'ole/storage/file_system'

231
lib/ole/ranges_io.rb Normal file
View File

@ -0,0 +1,231 @@
# need IO::Mode
require 'ole/support'
#
# = Introduction
#
# +RangesIO+ is a basic class for wrapping another IO object allowing you to arbitrarily reorder
# slices of the input file by providing a list of ranges. Intended as an initial measure to curb
# inefficiencies in the Dirent#data method just reading all of a file's data in one hit, with
# no method to stream it.
#
# This class will encapuslate the ranges (corresponding to big or small blocks) of any ole file
# and thus allow reading/writing directly to the source bytes, in a streamed fashion (so just
# getting 16 bytes doesn't read the whole thing).
#
# In the simplest case it can be used with a single range to provide a limited io to a section of
# a file.
#
# = Limitations
#
# * No buffering. by design at the moment. Intended for large reads
#
# = TODO
#
# On further reflection, this class is something of a joining/optimization of
# two separate IO classes. a SubfileIO, for providing access to a range within
# a File as a separate IO object, and a ConcatIO, allowing the presentation of
# a bunch of io objects as a single unified whole.
#
# I will need such a ConcatIO if I'm to provide Mime#to_io, a method that will
# convert a whole mime message into an IO stream, that can be read from.
# It will just be the concatenation of a series of IO objects, corresponding to
# headers and boundaries, as StringIO's, and SubfileIO objects, coming from the
# original message proper, or RangesIO as provided by the Attachment#data, that
# will then get wrapped by Mime in a Base64IO or similar, to get encoded on-the-
# fly. Thus the attachment, in its plain or encoded form, and the message as a
# whole never exists as a single string in memory, as it does now. This is a
# fair bit of work to achieve, but generally useful I believe.
#
# This class isn't ole specific, maybe move it to my general ruby stream project.
#
class RangesIO
  attr_reader :io, :mode, :ranges, :size, :pos
  # +io+:: the parent io object that we are wrapping.
  # +mode+:: the mode to use
  # +params+:: hash of params.
  # * :ranges - byte offsets, either:
  #   1. an array of ranges [1..2, 4..5, 6..8] or
  #   2. an array of arrays, where the second is length [[1, 1], [4, 1], [6, 2]] for the above
  #   (think the way String indexing works)
  # * :close_parent - boolean to close parent when this object is closed
  #
  # NOTE: the +ranges+ can overlap.
  def initialize io, mode='r', params={}
    mode, params = 'r', mode if Hash === mode
    ranges = params[:ranges]
    @params = {:close_parent => false}.merge params
    @mode = IO::Mode.new mode
    @io = io
    # convert ranges to arrays. check for negative ranges?
    # FIX: the default used to be `[0, io.size]` - a single bare pair rather
    # than an array of [pos, len] pairs, which made the size calculation below
    # crash when destructuring. it must be a list containing one pair.
    ranges ||= [[0, io.size]]
    @ranges = ranges.map { |r| Range === r ? [r.begin, r.end - r.begin] : r }
    # calculate size
    @size = @ranges.inject(0) { |total, (pos, len)| total + len }
    # initial position in the file
    @pos = 0
    # handle some mode flags
    truncate 0 if @mode.truncate?
    seek size if @mode.append?
  end
  #IOError: closed stream
  # get this for reading, writing, everything...
  #IOError: not opened for writing
  # add block form. TODO add test for this
  def self.open(*args, &block)
    ranges_io = new(*args)
    if block_given?
      begin; yield ranges_io
      ensure; ranges_io.close
      end
    else
      ranges_io
    end
  end
  # Seek within the virtual (concatenated) stream. Callable with a +whence+
  # via the #seek alias.
  def pos= pos, whence=IO::SEEK_SET
    case whence
    when IO::SEEK_SET
    when IO::SEEK_CUR
      pos += @pos
    when IO::SEEK_END
      pos = @size + pos
    else raise Errno::EINVAL
    end
    # FIX: allow seeking to end-of-file (pos == size). the previous exclusive
    # range made `seek size` raise Errno::EINVAL, which broke append modes -
    # #initialize seeks to +size+ when the mode is append, and #eof? itself is
    # defined as pos == size.
    raise Errno::EINVAL unless (0..@size) === pos
    @pos = pos
  end
  alias seek :pos=
  alias tell :pos
  def close
    @io.close if @params[:close_parent]
  end
  # returns the [+offset+, +size+], pair inorder to read/write at +pos+
  # (like a partial range), and its index.
  def offset_and_size pos
    total = 0
    ranges.each_with_index do |(offset, size), i|
      if pos <= total + size
        diff = pos - total
        return [offset + diff, size - diff], i
      end
      total += size
    end
    # should be impossible for any valid pos, (0...size) === pos
    raise ArgumentError, "no range for pos #{pos.inspect}"
  end
  def eof?
    @pos == @size
  end
  # read bytes from file, to a maximum of +limit+, or all available if unspecified.
  def read limit=nil
    data = ''
    return data if eof?
    limit ||= size
    partial_range, i = offset_and_size @pos
    # this may be conceptually nice (create sub-range starting where we are), but
    # for a large range array its pretty wasteful. even the previous way was. but
    # i'm not trying to optimize this atm. it may even go to c later if necessary.
    ([partial_range] + ranges[i+1..-1]).each do |pos, len|
      @io.seek pos
      if limit < len
        # convoluted, to handle read errors. s may be nil
        s = @io.read limit
        @pos += s.length if s
        break data << s
      end
      # convoluted, to handle ranges beyond the size of the file
      s = @io.read len
      @pos += s.length if s
      data << s
      break if s.length != len
      limit -= len
    end
    data
  end
  # you may override this call to update @ranges and @size, if applicable.
  def truncate size
    raise NotImplementedError, 'truncate not supported'
  end
  # using explicit forward instead of an alias now for overriding.
  # should override truncate.
  def size= size
    truncate size
  end
  # Write +data+ at the current position, spanning ranges as needed; grows
  # the stream via #truncate (if a subclass implements it) when out of room.
  # Returns the number of bytes written.
  def write data
    # short cut. needed because truncate 0 may return no ranges, instead of empty range,
    # thus offset_and_size fails.
    return 0 if data.empty?
    data_pos = 0
    # if we don't have room, we can use the truncate hook to make more space.
    if data.length > @size - @pos
      begin
        truncate @pos + data.length
      rescue NotImplementedError
        raise IOError, "unable to grow #{inspect} to write #{data.length} bytes"
      end
    end
    partial_range, i = offset_and_size @pos
    ([partial_range] + ranges[i+1..-1]).each do |pos, len|
      @io.seek pos
      if data_pos + len > data.length
        chunk = data[data_pos..-1]
        @io.write chunk
        @pos += chunk.length
        data_pos = data.length
        break
      end
      @io.write data[data_pos, len]
      @pos += len
      data_pos += len
    end
    data_pos
  end
  alias << write
  # i can wrap it in a buffered io stream that
  # provides gets, and appropriately handle pos,
  # truncate. mostly added just to past the tests.
  # FIXME
  def gets
    s = read 1024
    i = s.index "\n"
    # FIX: a chunk without a newline used to crash with NoMethodError on nil
    # (`i + 1`). behave like IO#gets instead: return what we have, or nil at
    # end of data.
    if i.nil?
      return s.empty? ? nil : s
    end
    @pos -= s.length - (i + 1)
    s[0..i]
  end
  alias readline :gets
  def inspect
    # the rescue is for empty files
    pos, len = (@ranges[offset_and_size(@pos).last] rescue [nil, nil])
    range_str = pos ? "#{pos}..#{pos+len}" : 'nil'
    "#<#{self.class} io=#{io.inspect}, size=#@size, pos=#@pos, "\
      "range=#{range_str}>"
  end
end
# this subclass of ranges io explicitly ignores the truncate part of 'w' modes.
# only really needed for the allocation table writes etc. maybe just use explicit modes
# for those
# better yet write a test that breaks before I fix it. added nodoc for the
# time being.
class RangesIONonResizeable < RangesIO # :nodoc:
  def initialize io, mode='r', params={}
    if Hash === mode
      params = mode
      mode = 'r'
    end
    # mask the truncate bit out of the requested mode's flags, so a 'w'
    # style open cannot wipe the ranges
    stripped_flags = IO::Mode.new(mode).flags & ~IO::TRUNC
    super io, stripped_flags, params
  end
end

View File

@ -1,934 +1,3 @@
#! /usr/bin/ruby -w
require 'iconv'
require 'date'
require 'stringio'
require 'tempfile'
require 'ole/base'
require 'ole/types'
require 'ole/io_helpers'
module Ole # :nodoc:
#
# = Introduction
#
# <tt>Ole::Storage</tt> is a simple class intended to abstract away details of the
# access to OLE2 structured storage files, such as those produced by
# Microsoft Office, eg *.doc, *.msg etc.
#
# Initially based on chicago's libole, source available at
# http://prdownloads.sf.net/chicago/ole.tgz
# Later augmented with some corrections by inspecting pole, and (purely
# for header definitions) gsf.
#
# = Usage
#
# Usage should be fairly straight forward:
#
# # get the parent ole storage object
# ole = Ole::Storage.open 'myfile.msg', 'r+'
# # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
# # read some data
# ole.root[1].read 4
# # => "\001\000\376\377"
# # get the top level root object and output a tree structure for
# # debugging
# puts ole.root.to_tree
# # =>
# - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
# |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
# | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
# ...
# |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
# |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
# \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
# |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
# ...
# # write some data, and finish up (note that open is 'r+', so this overwrites
# # but doesn't truncate)
# ole.root["\001CompObj"].open { |f| f.write "blah blah" }
# ole.close
#
# = TODO
#
# 1. tests. lock down how things work at the moment - mostly good.
# create from scratch works now, as does copying in a subtree of another doc, so
# ole embedded attachment serialization works now. i can save embedded xls in an msg
into a separate file, and open it. this was a goal. now i would want to implement
# to_mime conversion for embedded attachments, that serializes them to ole, but handles
# some separately like various meta file types as plain .wmf attachments perhaps. this
# will give pretty good .eml's from emails with embedded attachments.
# the other todo is .rtf output, with full support for embedded ole objects...
# 2. lots of tidying up
# - main FIXME's in this regard are:
# * the custom header cruft for Header and Dirent needs some love.
# * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
# and, in a manner of speaking, but arguably different, Storage itself.
# they have differing api's which would be nice to clean.
# AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
# * ole types need work, can't serialize datetime at the moment.
# 3. need to fix META_BAT support in #flush.
#
class Storage
VERSION = '1.1.1'
# The top of the ole tree structure
attr_reader :root
# The tree structure in its original flattened form. only valid after #load, or #flush.
attr_reader :dirents
# The underlying io object to/from which the ole object is serialized, whether we
# should close it, and whether it is writeable
attr_reader :io, :close_parent, :writeable
# Low level internals, you probably shouldn't need to mess with these
# (header record, big/small block allocation tables, small-block backing file)
attr_reader :header, :bbat, :sbat, :sb_file
# maybe include an option hash, and allow :close_parent => true, to be more general.
# +arg+ should be either a file, or an +IO+ object, and needs to be seekable.
# +mode+ may only be given when +arg+ is a filename.
def initialize arg, mode=nil
  # get the io object
  @close_parent, @io = if String === arg
    [true, open(arg, mode || 'rb')]
  else
    raise 'unable to specify mode string with io object' if mode
    [false, arg]
  end
  # do we have this file opened for writing? don't know of a better way to tell
  # (probe: flush raises IOError on an io not opened for writing)
  @writeable = begin
    @io.flush
    true
  rescue IOError
    false
  end
  # silence undefined warning in clear
  @sb_file = nil
  # if the io object has data, we should load it, otherwise start afresh
  if @io.size > 0; load
  else clear
  end
end
# Adds a block form to +new+, like ::File.open: the storage object is
# yielded, then closed when the block returns; the block's value is
# returned. Without a block, simply returns the new object.
def self.new arg, mode=nil
  ole = super
  return ole unless block_given?
  begin
    yield ole
  ensure
    ole.close
  end
end
class << self
# encouraged - reads better with the block form above
alias open :new
# deprecated - kept for backwards compatibility
alias load :new
end
# load document from file.
# Parses the header, builds the big block allocation table from the header
# and meta-bat blocks, reads and tree-ifies the directory entries, and sets
# up the small block file/table.
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.load header_block
  # create an empty bbat
  @bbat = AllocationTable::Big.new self
  # extra mbat blocks
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  # the bat chain is the header's spare room plus any meta-bat blocks
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'L*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])
  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. semantics changed - used to cut at first dir where dir.type == 0
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.load self, str }.reject { |d| d.type_id == 0 }
  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  # (singleton method on this particular dirents array)
  class << @dirents
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx]
      d.children = to_tree d.child
      raise "directory #{d.inspect} used twice" if d.idx
      d.idx = idx
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end
  @root = @dirents.to_tree.first
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  unused = @dirents.reject(&:idx).length
  Log.warn "* #{unused} unused directories" if unused > 0
  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  @sb_file = RangesIOResizeable.new @bbat, @root.first_block, @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
# Flush pending changes (when writeable), close the small-block backing
# file, and close the parent io if we own it.
def close
  if @writeable
    flush
  end
  @sb_file.close
  if @close_parent
    @io.close
  end
end
# should have a #open_dirent i think. and use it in load and flush. neater.
# also was thinking about Dirent#open_padding. then i can more easily clean up the padding
# to be 0.chr
=begin
thoughts on fixes:
1. reterminate any chain not ending in EOC.
2. pass through all chain heads looking for collisions, and making sure nothing points to them
(ie they are really heads).
3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
in the bat for them.
this stuff will ensure reliability of input better. otherwise, its actually worth doing a repack
directly after read, to ensure the above is probably accounted for, before subsequent writes possibly
destroy things.
=end
# Serialize the in-memory tree back to the underlying io: the flattened
# dirent records first, then the sbat, then the bbat itself (whose blocks
# must be claimed from that same table), and finally the header plus the
# 109-entry meta-bat chain at offset 0.
def flush
  # recreate dirs from our tree, split into dirs and big and small files
  @root.type = :root
  # for now.
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten
  #dirs, files = @dirents.partition(&:dir?)
  #big_files, small_files = files.partition { |file| file.size > @header.threshold }
  # maybe i should move the block form up to RangesIO, and get it for free at all levels.
  # Dirent#open gets block form for free then
  # 1. write out the flattened dirent records, zero-padded to a whole block
  io = RangesIOResizeable.new @bbat, @header.dirent_start
  io.truncate 0
  @dirents.each { |dirent| io.write dirent.save }
  padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
  #p [:padding, padding]
  io.write 0.chr * padding
  @header.dirent_start = io.first_block
  io.close
  # similarly for the sbat data.
  io = RangesIOResizeable.new @bbat, @header.sbat_start
  io.truncate 0
  io.write @sbat.save
  @header.sbat_start = io.first_block
  @header.num_sbat = @bbat.chain(@header.sbat_start).length
  io.close
  # what follows will be slightly more complex for the bat fiddling.
  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.
  # 2. release blocks previously marked bat/meta-bat so they can be reclaimed
  @bbat.table.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ?
      AllocationTable::AVAIL : b
  end
  io = RangesIOResizeable.new @bbat, AllocationTable::EOC
  # use crappy loop for now:
  # 3. grow the claimed chain until it can hold the bat's own serialized
  # form; growing the chain can itself grow the bat, hence the loop
  while true
    bbat_data = @bbat.save
    #mbat_data = bbat_data.length / @bbat.block_size * 4
    mbat_chain = @bbat.chain io.first_block
    raise NotImplementedError, "don't handle writing out extra META_BAT blocks yet" if mbat_chain.length > 109
    # so we can ignore meta blocks in this calculation:
    break if io.size >= bbat_data.length # it shouldn't be bigger right?
    # this may grow the bbat, depending on existing available blocks
    io.truncate bbat_data.length
  end
  # now extract the info we want:
  ranges = io.ranges
  mbat_chain = @bbat.chain io.first_block
  io.close
  mbat_chain.each { |b| @bbat.table[b] = AllocationTable::BAT }
  @header.num_bat = mbat_chain.length
  #p @bbat.truncated_table
  #p ranges
  #p mbat_chain
  # not resizeable!
  # 4. write the final bat into the blocks claimed above
  io = RangesIO.new @io, ranges
  io.write @bbat.save
  io.close
  mbat_chain += [AllocationTable::AVAIL] * (109 - mbat_chain.length)
  @header.mbat_start = AllocationTable::EOC
  @header.num_mbat = 0
=begin
bbat_data = new_bbat.save
# must exist as linear chain stored in header.
@header.num_bat = (bbat_data.length / new_bbat.block_size.to_f).ceil
base = io.pos / new_bbat.block_size - 1
io.write bbat_data
# now that spanned a number of blocks:
mbat = (0...@header.num_bat).map { |i| i + base }
mbat += [AllocationTable::AVAIL] * (109 - mbat.length) if mbat.length < 109
header_mbat = mbat[0...109]
other_mbat_data = mbat[109..-1].pack 'L*'
@header.mbat_start = base + @header.num_bat
@header.num_mbat = (other_mbat_data.length / new_bbat.block_size.to_f).ceil
io.write other_mbat_data
=end
  @root.type = :dir
  # now seek back and write the header out
  @io.seek 0
  @io.write @header.save + mbat_chain.pack('L*')
  @io.flush
end
# Reset to an empty ole document, discarding any existing content in the
# backing io (equivalent to loading an empty document).
def clear
  # first step though is to support modifying pre-existing and saving, then this
  # missing gap will be fairly straight forward - essentially initialize to
  # equivalent of loading an empty ole document.
  #raise NotImplementedError, 'unable to create new ole objects from scratch as yet'
  Log.warn 'creating new ole storage object on non-writable io' unless @writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new self
  @root = Dirent.new self, :dir
  @root.name = 'Root Entry'
  @dirents = [@root]
  @root.idx = 0
  @root.children = []
  # size shouldn't display for non-files
  @root.size = 0
  # replace any previous small-block file with an empty one
  @sb_file.close if @sb_file
  @sb_file = RangesIOResizeable.new @bbat, AllocationTable::EOC
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
# could be useful with mis-behaving ole documents. or to just clean them up.
# +temp+ selects the scratch backing: :file (Tempfile) or :mem (StringIO).
def repack temp=:file
  helper = method :repack_using_io
  if temp == :file
    Tempfile.open 'w+', &helper
  elsif temp == :mem
    StringIO.open(&helper)
  else
    raise "unknown temp backing #{temp.inspect}"
  end
end
# Copy the raw document into +temp_io+, reset ourselves with #clear, then
# copy the dirent tree back in from that pristine copy (used by #repack).
def repack_using_io temp_io
  @io.rewind
  # IO.copy comes from ole/support
  IO.copy @io, temp_io
  clear
  Storage.open temp_io do |temp_ole|
    # NOTE(review): the temp root's type is flipped to :dir before copying -
    # presumably so Dirent.copy handles it uniformly; confirm against Dirent.
    temp_ole.root.type = :dir
    Dirent.copy temp_ole.root, root
  end
end
# Select the allocation table for a stream of +size+ bytes: streams at or
# above the header threshold live in big blocks, smaller ones in the sbat.
def bat_for_size size
  # note >=, not > previously.
  if size >= @header.threshold
    @bbat
  else
    @sbat
  end
end
def inspect
"#<#{self.class} io=#{@io.inspect} root=#{@root.inspect}>"
end
# A class which wraps the ole header
# The header occupies the first SIZE (0x4c) bytes of the file; the rest of
# the first 512-byte block holds the start of the meta-bat (written by
# Storage#flush, not here).
class Header < Struct.new(
  :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
  :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
  :sbat_start, :num_sbat, :mbat_start, :num_mbat
)
  # little-endian pack format, field-for-field with the member list above
  PACK = 'a8 a16 S2 a2 S2 a6 L3 a4 L5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  MAGIC = "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # expected value of Header#magic
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, "\xfe\xff", 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]
  # 2 basic initializations, from scratch, or from a data string.
  # from scratch will be geared towards creating a new ole object
  def initialize *values
    super(*(values.empty? ? DEFAULT : values))
    validate!
  end
  # parse a header out of the leading bytes of +str+
  def self.load str
    Header.new(*str.unpack(PACK))
  end
  # serialize back to the 76-byte on-disk representation
  def save
    to_a.pack PACK
  end
  # Sanity-check the header fields. Hard inconsistencies raise; soft ones
  # seen in real-world files only produce a warning. Returns true.
  def validate!
    raise "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 or # is that valid for a completely empty file?
        # not sure about this one. basically to do max possible bat given size of mbat
        num_bat > 109 && num_bat > 109 + num_mbat * (1 << b_shift - 2) or
        # shouldn't need to use the mbat as there is enough space in the header block
        num_bat < 109 && num_mbat != 0 or
        # given the size of the header is 76, if b_shift <= 6, blocks address the header.
        s_shift > b_shift or b_shift <= 6 or b_shift >= 31 or
        # we only handle little endian
        byte_order != "\xfe\xff"
      raise "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 or
        num_mbat == 0 && mbat_start != AllocationTable::EOC or
        reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
#
# +AllocationTable+'s hold the chains corresponding to files. Given
# an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
# the blocks that make up that file.
#
# There are 2 allocation tables, the bbat, and sbat, for big and small
# blocks respectively. The block chain should be loaded using either
# <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
# as appropriate.
#
# Whether or not big or small blocks are used for a file depends on
# whether its size is over the <tt>Header#threshold</tt> level.
#
# An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
# which are stored in blocks throughout the file. The blocks are either
# big or small, and are accessed using the <tt>AllocationTable</tt>.
#
# The bbat allocation table's data is stored in the spare room in the header
# block, and in extra blocks throughout the file as referenced by the meta
# bat. That chain is linear, as there is no higher level table.
#
class AllocationTable
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks correspond to the bat, and aren't part of a file, nor available.
  # (I don't currently output these)
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc
  attr_reader :ole, :io, :table, :block_size
  def initialize ole
    @ole = ole
    @table = []
  end
  # replace the table with the unpacked 32-bit little-endian entries of +data+
  def load data
    @table = data.unpack('L*')
  end
  def truncated_table
    # this strips trailing AVAILs. come to think of it, this has the potential to break
    # bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
    # very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
    # at load time.
    temp = @table.reverse
    not_avail = temp.find { |b| b != AVAIL } and temp = temp[temp.index(not_avail)..-1]
    temp.reverse
  end
  # serialize the table, padded with AVAIL to a whole big block
  def save
    table = truncated_table #@table
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'L*'
  end
  # rewriting this to be non-recursive. it broke on a large attachment
  # building up the chain, causing a stack error. need tail-call elimination...
  #
  # Follow the chain from block +start+, returning the array of block indices.
  def chain start
    a = []
    idx = start
    until idx >= META_BAT
      # FIX: the bound used to be `idx > @table.length`, which let an index
      # equal to the table length through - @table[idx] then returned nil and
      # the next iteration crashed with NoMethodError instead of raising the
      # intended error. valid indices are 0...@table.length.
      raise "broken allocationtable chain" if idx < 0 || idx >= @table.length
      a << idx
      idx = @table[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end
  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end
  # Turn a chain (an array given by +chain+) of big blocks, optionally
  # truncated to +size+, into an array of arrays describing the stretches of
  # bytes in the file that it belongs to.
  #
  # Big blocks are of size Ole::Storage::Header#b_size, and are stored
  # directly in the parent file.
  # truncate the chain if required
  # convert chain to ranges of the block size
  # truncate final range if required
  def blocks_to_ranges chain, size=nil
    chain = chain[0...(size.to_f / block_size).ceil] if size
    ranges = chain.map { |i| [block_size * i, block_size] }
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end
  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
  # its not resizeable or migrateable. it probably could be resizeable though, using
  # self as the bat. but what would the first_block be?
  def open chain, size=nil
    io = RangesIO.new @io, ranges(chain, size)
    if block_given?
      begin yield io
      ensure; io.close
      end
    else io
    end
  end
  def read chain, size=nil
    open chain, size, &:read
  end
  # ----------------------
  # return the index of the first AVAIL entry, appending one if none exists
  def get_free_block
    @table.each_index { |i| return i if @table[i] == AVAIL }
    @table.push AVAIL
    @table.length - 1
  end
  # Grow or shrink the chain headed by +first_block+ to hold +size+ bytes.
  # must return first_block
  def resize_chain first_block, size
    new_num_blocks = (size / block_size.to_f).ceil
    blocks = chain first_block
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| @table[blocks[i]] = AVAIL }
      # if we have a chain, terminate it and return head, otherwise return EOC
      if new_num_blocks > 0
        @table[blocks[new_num_blocks-1]] = EOC
        first_block
      else EOC
      end
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = get_free_block
        # connect the chain. handle corner case of blocks being [] initially
        if last_block
          @table[last_block] = block
        else
          first_block = block
        end
        last_block = block
        # this is just to inhibit the problem where it gets picked as being a free block
        # again next time around.
        @table[last_block] = EOC
      end
      first_block
    else first_block
    end
  end
  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end
    # Big blocks are kind of -1 based, in order to not clash with the header.
    def blocks_to_ranges blocks, size
      super blocks.map { |b| b + 1 }, size
    end
  end
  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
# like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
# AllocationTable, and can be resized. used for read/write to 2 streams:
# 1. serialized dirent data
# 2. sbat table data
# 3. all dirents but through RangesIOMigrateable below
#
# Note that all internal access to first_block is through accessors, as it is sometimes
# useful to redirect it.
class RangesIOResizeable < RangesIO
  attr_reader :bat
  attr_accessor :first_block
  # +bat+ is the AllocationTable backing this stream; +first_block+ is the
  # head of its chain within that table (EOC for an empty stream).
  def initialize bat, first_block, size=nil
    @bat = bat
    self.first_block = first_block
    # hand the bat's underlying io and the chain's computed ranges to RangesIO
    super @bat.io, @bat.ranges(first_block, size)
  end
  # Grow or shrink the stream by reallocating its chain in the bat, then
  # recomputing our ranges from the new chain.
  def truncate size
    # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
    # merge_ranges function that merges sequential ranges into one as an optimization.
    self.first_block = @bat.resize_chain first_block, size
    @ranges = @bat.ranges first_block, size
    @pos = @size if @pos > size
    # don't know if this is required, but we explicitly request our @io to grow if necessary
    # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
    # can be made.
    # maybe its ok to just seek out there later??
    max = @ranges.map { |pos, len| pos + len }.max || 0
    @io.truncate max if max > @io.size
    @size = size
  end
end
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
# between bats based on size, and updating the dirent, instead of the ole copy back
# on close.
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent
  # +dirent+ supplies everything: the bat (chosen by size), the first block,
  # and the stream size.
  def initialize dirent
    @dirent = dirent
    super @dirent.ole.bat_for_size(@dirent.size), @dirent.first_block, @dirent.size
  end
  # Resize the stream, migrating its data between the small and big block
  # allocation tables when +size+ crosses the threshold boundary.
  def truncate size
    bat = @dirent.ole.bat_for_size size
    if bat != @bat
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      # backup this
      pos = @pos
      @pos = 0
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      @bat = bat
      # just change the underlying io from right under everyone :)
      @io = bat.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super
      @pos = 0
      write keep
      @pos = pos
    else
      super
    end
    # now just update the file
    @dirent.size = size
  end
  # forward this to the dirent
  def first_block
    @dirent.first_block
  end
  def first_block= val
    @dirent.first_block = val
  end
end
#
# A class which wraps an ole directory entry. Can be either a directory
# (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
#
# Most interaction with <tt>Ole::Storage</tt> is through this class.
# The 2 most important functions are <tt>Dirent#children</tt>, and
# <tt>Dirent#data</tt>.
#
# was considering separate classes for dirs and files. some methods/attrs only
# applicable to one or the other.
# A class which wraps an ole directory entry, either a directory (Dirent#dir?)
# or a file (Dirent#file?). The raw 128-byte record is kept in @values and
# exposed through generated per-field accessors (see MEMBERS below).
class Dirent
  MEMBERS = [
    :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
    :clsid, :flags, # dirs only
    :create_time_str, :modify_time_str, # files only
    :first_block, :size, :reserved
  ]
  PACK = 'a64 S C C L3 a16 L a8 a8 L2 a4'
  SIZE = 128
  # OLE FILETIMEs count from 1601-01-01
  EPOCH = DateTime.parse '1601-01-01'
  TYPE_MAP = {
    # this is temporary
    0 => :empty,
    1 => :dir,
    2 => :file,
    5 => :root
  }
  # the dirent tree is nominally red-black
  COLOUR_MAP = {
    0 => :red,
    1 => :black
  }
  # used in the next / prev / child stuff to show that the tree ends here.
  # also used for first_block for directory.
  EOT = 0xffffffff
  # All +Dirent+ names are in UTF16, which we convert
  FROM_UTF16 = Iconv.new 'utf-8', 'utf-16le'
  TO_UTF16 = Iconv.new 'utf-16le', 'utf-8'
  include Enumerable
  # raw unpacked field values, in MEMBERS order
  attr_accessor :values
  # Dirent's should be created in 1 of 2 ways, either Dirent.new ole, [:dir/:file/:root],
  # or Dirent.load '... dirent data ...'
  # its a bit clunky, but thats how it is at the moment. you can assign to type, but
  # shouldn't.
  # index into the flattened dirent array, filled in by tree rebuild/flatten
  attr_accessor :idx
  # This returns all the children of this +Dirent+. It is filled in
  # when the tree structure is recreated.
  attr_accessor :children
  attr_reader :ole, :type, :create_time, :modify_time, :name
  # Create a fresh dirent of the given +type+ (:dir, :file or :root).
  def initialize ole, type
    @ole = ole
    # this isn't really good enough. need default values put in there.
    @values = [
      0.chr * 2, 2, 0, # will get overwritten
      1, EOT, EOT, EOT,
      0.chr * 16, 0, nil, nil,
      AllocationTable::EOC, 0, 0.chr * 4]
    # maybe check types here.
    @type = type
    @create_time = @modify_time = nil
    @children = []
    if file?
      @create_time = Time.now
      @modify_time = Time.now
    end
  end
  # Deserialize a dirent from its on-disk 128 byte record without running
  # the normal constructor.
  def self.load ole, str
    # load should function without the need for the initializer.
    dirent = Dirent.allocate
    dirent.load ole, str
    dirent
  end
  def load ole, str
    @ole = ole
    @values = str.unpack PACK
    @name = FROM_UTF16.iconv name_utf16[0...name_len].sub(/\x00\x00$/, '')
    @type = TYPE_MAP[type_id] or raise "unknown type #{type_id.inspect}"
    if file?
      @create_time = Types.load_time create_time_str
      @modify_time = Types.load_time modify_time_str
    end
  end
  # only defined for files really. and the above children stuff is only for children.
  # maybe i should have some sort of File and Dir class, that subclass Dirents? a dirent
  # is just a data holder.
  # this can be used for write support if the underlying io object was opened for writing.
  # maybe take a mode string argument, and do truncation, append etc stuff.
  def open
    return nil unless file?
    io = RangesIOMigrateable.new self
    if block_given?
      begin yield io
      ensure; io.close
      end
    else io
    end
  end
  # Read at most +limit+ bytes (or all) of this file entry's stream.
  def read limit=nil
    open { |io| io.read limit }
  end
  def dir?
    # to count root as a dir.
    type != :file
  end
  def file?
    type == :file
  end
  def time
    # time is nil for streams, otherwise try to parse either of the time pairs (not
    # sure of their meaning - created / modified?)
    #@time ||= file? ? nil : (Dirent.parse_time(secs1, days1) || Dirent.parse_time(secs2, days2))
    create_time || modify_time
  end
  # iterate over direct children
  def each(&block)
    @children.each(&block)
  end
  def [] idx
    return children[idx] if Integer === idx
    # path style look up.
    # maybe take another arg to allow creation? or leave that to the filesystem
    # add on.
    # not sure if '/' is a valid char in an Dirent#name, so no splitting etc at
    # this level.
    # also what about warning about multiple hits for the same name?
    children.find { |child| idx === child.name }
  end
  # solution for the above '/' thing for now.
  def / path
    self[path]
  end
  # ascii-art rendering of the subtree rooted here, for debugging
  def to_tree
    if children and !children.empty?
      str = "- #{inspect}\n"
      children.each_with_index do |child, i|
        last = i == children.length - 1
        child.to_tree.split(/\n/).each_with_index do |line, j|
          str << " #{last ? (j == 0 ? "\\" : ' ') : '|'}#{line}\n"
        end
      end
      str
    else "- #{inspect}\n"
    end
  end
  # generate reader/writer pairs for each raw field, backed by @values
  MEMBERS.each_with_index do |sym, i|
    define_method(sym) { @values[i] }
    define_method(sym.to_s + '=') { |val| @values[i] = val }
  end
  def to_a
    @values
  end
  # flattens the tree starting from here into +dirents+. note it modifies its argument.
  def flatten dirents=[]
    @idx = dirents.length
    dirents << self
    children.each { |child| child.flatten dirents }
    self.child = Dirent.flatten_helper children
    dirents
  end
  # i think making the tree structure optimized is actually more complex than this, and
  # requires some intelligent ordering of the children based on names, but as long as
  # it is valid its ok.
  # actually, i think its ok. gsf for example only outputs a singly-linked-list, where
  # prev is always EOT.
  def self.flatten_helper children
    return EOT if children.empty?
    i = children.length / 2
    this = children[i]
    this.prev, this.next = [(0...i), (i+1..-1)].map { |r| flatten_helper children[r] }
    this.idx
  end
  attr_accessor :name, :type
  # Serialize back to the 128 byte on-disk record.
  def save
    tmp = TO_UTF16.iconv(name)
    tmp = tmp[0, 62] if tmp.length > 62
    tmp += 0.chr * 2
    self.name_len = tmp.length
    self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
    begin
      self.type_id = TYPE_MAP.to_a.find { |id, name| @type == name }.first
    rescue
      raise "unknown type #{type.inspect}"
    end
    # for the case of files, it is assumed that that was handled already
    # note not dir?, so as not to override root's first_block
    self.first_block = Dirent::EOT if type == :dir
    # BUGFIX: this used to read +if 0 #file?+ with the zeroing below in the
    # else branch. but 0 is truthy in ruby (only nil/false are falsy), so the
    # else branch never ran, leaving nil time strings that break the pack for
    # freshly created dirents. zero them only when unset, so times loaded
    # from disk are preserved. TODO: serialize @create_time/@modify_time.
    self.create_time_str ||= 0.chr * 8
    self.modify_time_str ||= 0.chr * 8
    @values.pack PACK
  end
  def inspect
    str = "#<Dirent:#{name.inspect}"
    # perhaps i should remove the data snippet. its not that useful anymore.
    if file?
      tmp = read 9
      data = tmp.length == 9 ? tmp[0, 5] + '...' : tmp
      str << " size=#{size}" +
        "#{time ? ' time=' + time.to_s.inspect : nil}" +
        " data=#{data.inspect}"
    else
      # there is some dir specific stuff. like clsid, flags.
    end
    str + '>'
  end
  # --------
  # and for creation of a dirent. don't like the name. is it a file or a directory?
  # assign to type later? io will be empty.
  def new_child type
    child = Dirent.new ole, type
    children << child
    yield child if block_given?
    child
  end
  # Detach +child+ from this dirent and free its blocks.
  def delete child
    # remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
    raise "#{child.inspect} not a child of #{self.inspect}" unless @children.delete child
    # free our blocks
    child.open { |io| io.truncate 0 }
  end
  # Recursively copy the contents of +src+ into +dst+ (same type required).
  def self.copy src, dst
    # copies the contents of src to dst. must be the same type. this will throw an
    # error on copying to root. maybe this will recurse too much for big documents??
    raise unless src.type == dst.type
    dst.name = src.name
    if src.dir?
      src.children.each do |src_child|
        dst.new_child(src_child.type) { |dst_child| Dirent.copy src_child, dst_child }
      end
    else
      src.open do |src_io|
        dst.open { |dst_io| IO.copy src_io, dst_io }
      end
    end
  end
end
end
end
# quick command-line usage: dump the directory tree of the given ole file.
if $0 == __FILE__
  puts Ole::Storage.open(ARGV[0]) { |ole| ole.root.to_tree }
end
require 'ole/storage/base'
require 'ole/storage/file_system'
require 'ole/storage/meta_data'

916
lib/ole/storage/base.rb Executable file
View File

@ -0,0 +1,916 @@
require 'tempfile'
require 'ole/base'
require 'ole/types'
require 'ole/ranges_io'
module Ole # :nodoc:
#
# This class is the primary way the user interacts with an OLE storage file.
#
# = TODO
#
# * the custom header cruft for Header and Dirent needs some love.
# * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
# and, in a manner of speaking, but arguably different, Storage itself.
# they have differing api's which would be nice to rethink.
# AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
#
class Storage
# thrown for any bogus OLE file errors.
class FormatError < StandardError # :nodoc:
end
VERSION = '1.2.8.2'
# options used at creation time
attr_reader :params
# The top of the ole tree structure
attr_reader :root
# The tree structure in its original flattened form. only valid after #load, or #flush.
attr_reader :dirents
# The underlying io object to/from which the ole object is serialized, whether we
# should close it, and whether it is writeable
attr_reader :io, :close_parent, :writeable
# Low level internals, you probably shouldn't need to mess with these
attr_reader :header, :bbat, :sbat, :sb_file
# +arg+ should be either a filename, or an +IO+ object, and needs to be seekable.
# +mode+ is optional, and should be a regular mode string.
# +arg+:: either a filename (opened here, closed on #close) or a seekable IO.
# +mode+:: regular File mode string; only allowed together with a filename.
# +params+:: options hash, currently just :update_timestamps (default true).
def initialize arg, mode=nil, params={}
  # allow (arg, params) form - shift the hash over
  params, mode = mode, nil if Hash === mode
  params = {:update_timestamps => true}.merge(params)
  @params = params
  # get the io object
  @close_parent, @io = if String === arg
    mode ||= 'rb'
    [true, open(arg, mode)]
  else
    raise ArgumentError, 'unable to specify mode string with io object' if mode
    [false, arg]
  end
  # do we have this file opened for writing? don't know of a better way to tell
  # (unless we parse the mode string in the open case)
  # hmmm, note that in ruby 1.9 this doesn't work anymore. which is all the more
  # reason to use mode string parsing when available, and fall back to something like
  # io.writeable? otherwise.
  @writeable = begin
    if mode
      IO::Mode.new(mode).writeable?
    else
      # probe the io: flushing / zero-length syswrite raise IOError when read-only
      @io.flush
      # this is for the benefit of ruby-1.9
      @io.syswrite('') if @io.respond_to?(:syswrite)
      true
    end
  rescue IOError
    false
  end
  # silence undefined warning in clear
  @sb_file = nil
  # if the io object has data, we should load it, otherwise start afresh
  # this should be based on the mode string rather.
  @io.size > 0 ? load : clear
end
# somewhat similar to File.open, the open class method allows a block form where
# the Ole::Storage object is automatically closed on completion of the block.
# Somewhat similar to File.open: without a block, returns the new
# Ole::Storage object; with a block, yields it and guarantees #close runs
# when the block finishes.
def self.open arg, mode=nil, params={}
  ole = new arg, mode, params
  return ole unless block_given?
  begin
    yield ole
  ensure
    ole.close
  end
end
# load document from file.
#
# TODO: implement various allocationtable checks, maybe as a AllocationTable#fsck function :)
#
# 1. reterminate any chain not ending in EOC.
# compare file size with actually allocated blocks per file.
# 2. pass through all chain heads looking for collisions, and making sure nothing points to them
# (ie they are really heads). in both sbat and mbat
# 3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
# in the bat for them.
# 4. maybe a check of excess data. if there is data outside the bbat.truncate.length + 1 * block_size,
# (eg what is used for truncate in #flush), then maybe add some sort of message about that. it
# will be automatically thrown away at close time.
# Parse an existing ole document from @io: header, big/small allocation
# tables, and the dirent tree (@root / @dirents).
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.new header_block
  # create an empty bbat.
  @bbat = AllocationTable::Big.new self
  # the first 109 bbat chain entries follow the header; the rest come from
  # the meta-bat linked list (last entry of each mbat block points to the next)
  bbat_chain = header_block[Header::SIZE..-1].unpack 'V*'
  mbat_block = @header.mbat_start
  @header.num_mbat.times do
    blocks = @bbat.read([mbat_block]).unpack 'V*'
    mbat_block = blocks.pop
    bbat_chain += blocks
  end
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])
  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. semantics changed - used to cut at first dir where dir.type == 0
  @dirents = @bbat.read(@header.dirent_start).to_enum(:each_chunk, Dirent::SIZE).
    map { |str| Dirent.new self, str }.reject { |d| d.type_id == 0 }
  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  class << @dirents
    # in-order walk of the prev/next/child binary tree, assigning idx
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx]
      d.children = to_tree d.child
      raise FormatError, "directory #{d.inspect} used twice" if d.idx
      d.idx = idx
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end
  @root = @dirents.to_tree.first
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  unused = @dirents.reject(&:idx).length
  Log.warn "#{unused} unused directories" if unused > 0
  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  # which mode to use here?
  # the root dirent's stream holds the small-block data
  @sb_file = RangesIOResizeable.new @bbat, :first_block => @root.first_block, :size => @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
# Close the document, flushing metadata first when opened writeable.
# NOTE: @sb_file is closed before flush so the root dirent's backing stream
# (first_block/size) is up to date when flush serializes it.
def close
  @sb_file.close
  flush if @writeable
  @io.close if @close_parent
end
# the flush method is the main "save" method. all file contents are always
# written directly to the file by the RangesIO objects, all this method does
# is write out all the file meta data - dirents, allocation tables, file header
# etc.
#
# maybe add an option to zero the padding, and any remaining avail blocks in the
# allocation table.
#
# TODO: long and overly complex. simplify and test better. eg, perhaps move serialization
# of bbat to AllocationTable::Big.
# The flush method is the main "save" method - file contents are always
# written directly by the RangesIO objects, so this only serializes the
# metadata: dirents, sbat, bbat/mbat, and finally the header.
# (change: removed the unused local +before = @io.size+ that shadowed the
# pre-truncate size but was never read.)
def flush
  # update root dirent, and flatten dirent tree
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten
  # serialize the dirents using the bbat
  RangesIOResizeable.open @bbat, 'w', :first_block => @header.dirent_start do |io|
    @dirents.each { |dirent| io.write dirent.to_s }
    padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
    io.write 0.chr * padding
    @header.dirent_start = io.first_block
  end
  # serialize the sbat
  # perhaps the blocks used by the sbat should be marked with BAT?
  RangesIOResizeable.open @bbat, 'w', :first_block => @header.sbat_start do |io|
    io.write @sbat.to_s
    @header.sbat_start = io.first_block
    @header.num_sbat = @bbat.chain(@header.sbat_start).length
  end
  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.
  # this is perhaps not good, as we reclaim all bat blocks here, which
  # may include the sbat we just wrote. FIXME
  @bbat.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ? AllocationTable::AVAIL : b
  end
  # currently we use a loop. this could be better, but basically,
  # the act of writing out the bat, itself requires blocks which get
  # recorded in the bat.
  #
  # i'm sure that there'd be some simpler closed form solution to this. solve
  # recursive func:
  #
  # num_mbat_blocks = ceil(max((mbat_len - 109) * 4 / block_size, 0))
  # bbat_len = initial_bbat_len + num_mbat_blocks
  # mbat_len = ceil(bbat_len * 4 / block_size)
  #
  # the actual bbat allocation table is itself stored throughout the file, and that chain
  # is stored in the initial blocks, and the mbat blocks.
  num_mbat_blocks = 0
  io = RangesIOResizeable.new @bbat, 'w', :first_block => AllocationTable::EOC
  # truncate now, so that we can simplify size calcs - the mbat blocks will be appended in a
  # contiguous chunk at the end.
  # hmmm, i think this truncate should be matched with a truncate of the underlying io. if you
  # delete a lot of stuff, and free up trailing blocks, the file size never shrinks. this can
  # be fixed easily, add an io truncate
  @bbat.truncate!
  @io.truncate @bbat.block_size * (@bbat.length + 1)
  while true
    # get total bbat size. equivalent to @bbat.to_s.length, but for the factoring in of
    # the mbat blocks. we can't just add the mbat blocks directly to the bbat, as as this iteration
    # progresses, more blocks may be needed for the bat itself (if there are no more gaps), and the
    # mbat must remain contiguous.
    bbat_data_len = ((@bbat.length + num_mbat_blocks) * 4 / @bbat.block_size.to_f).ceil * @bbat.block_size
    # now storing the excess mbat blocks also increases the size of the bbat:
    new_num_mbat_blocks = ([bbat_data_len / @bbat.block_size - 109, 0].max * 4 / (@bbat.block_size.to_f - 4)).ceil
    if new_num_mbat_blocks != num_mbat_blocks
      # need more space for the mbat.
      num_mbat_blocks = new_num_mbat_blocks
    elsif io.size != bbat_data_len
      # need more space for the bat
      # this may grow the bbat, depending on existing available blocks
      io.truncate bbat_data_len
    else
      break
    end
  end
  # now extract the info we want:
  ranges = io.ranges
  bbat_chain = @bbat.chain io.first_block
  io.close
  bbat_chain.each { |b| @bbat[b] = AllocationTable::BAT }
  # tack on the mbat stuff
  @header.num_bat = bbat_chain.length
  mbat_blocks = (0...num_mbat_blocks).map do
    block = @bbat.free_block
    @bbat[block] = AllocationTable::META_BAT
    block
  end
  @header.mbat_start = mbat_blocks.first || AllocationTable::EOC
  # now finally write the bbat, using a not resizable io.
  # the mode here will be 'r', which allows write atm.
  RangesIO.open(@io, :ranges => ranges) { |f| f.write @bbat.to_s }
  # this is the mbat. pad it out.
  bbat_chain += [AllocationTable::AVAIL] * [109 - bbat_chain.length, 0].max
  @header.num_mbat = num_mbat_blocks
  if num_mbat_blocks != 0
    # write out the mbat blocks now. first of all, where are they going to be?
    mbat_data = bbat_chain[109..-1]
    # expand the mbat_data to include the linked list forward pointers.
    mbat_data = mbat_data.to_enum(:each_slice, @bbat.block_size / 4 - 1).to_a.
      zip(mbat_blocks[1..-1] + [nil]).map { |a, b| b ? a + [b] : a }
    # pad out the last one.
    mbat_data.last.push(*([AllocationTable::AVAIL] * (@bbat.block_size / 4 - mbat_data.last.length)))
    RangesIO.open @io, :ranges => @bbat.ranges(mbat_blocks) do |f|
      f.write mbat_data.flatten.pack('V*')
    end
  end
  # now seek back and write the header out
  @io.seek 0
  @io.write @header.to_s + bbat_chain[0, 109].pack('V*')
  @io.flush
end
# Reset to the equivalent of a freshly created, empty ole document,
# discarding any existing contents of @io.
def clear
  # initialize to equivalent of loading an empty ole document.
  Log.warn 'creating new ole storage object on non-writable io' unless @writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new self
  @root = Dirent.new self, :type => :root, :name => 'Root Entry'
  @dirents = [@root]
  @root.idx = 0
  # release the old small block file before replacing it
  @sb_file.close if @sb_file
  @sb_file = RangesIOResizeable.new @bbat, :first_block => AllocationTable::EOC
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
# could be useful with mis-behaving ole documents. or to just clean them up.
# Rewrite the document compactly via a temporary copy - useful for cleaning
# up mis-behaving or fragmented ole documents. +temp+ selects the scratch
# backing: :file (a Tempfile) or :mem (a StringIO).
def repack temp=:file
  if temp == :file
    Tempfile.open 'ole-repack' do |backing|
      backing.binmode
      repack_using_io backing
    end
  elsif temp == :mem
    StringIO.open('', &method(:repack_using_io))
  else
    raise ArgumentError, "unknown temp backing #{temp.inspect}"
  end
end
# Copy the raw document into +temp_io+, wipe ourselves with #clear, then
# recursively copy the dirent tree back from the temporary copy.
def repack_using_io temp_io
  @io.rewind
  # IO.copy is an extension provided by ole/support
  IO.copy @io, temp_io
  clear
  Storage.open temp_io, nil, @params do |temp_ole|
    #temp_ole.root.type = :dir
    Dirent.copy temp_ole.root, root
  end
end
# Pick the allocation table for a stream of +size+ bytes: streams at or
# above the header threshold use big blocks, smaller ones the small block
# file. (note >=, not > previously.)
def bat_for_size size
  return @bbat if size >= @header.threshold
  @sbat
end
# Terse debug representation, showing the backing io and the tree root.
def inspect
  format '#<%s io=%s root=%s>', self.class, @io.inspect, @root.inspect
end
#
# A class which wraps the ole header
#
# Header.new can be both used to load from a string, or to create from
# defaults. Serialization is accomplished with the #to_s method.
#
class Header < Struct.new(
  :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
  :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
  :sbat_start, :num_sbat, :mbat_start, :num_mbat
)
  PACK = 'a8 a16 v2 a2 v2 a6 V3 a4 V5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  MAGIC = "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # expected value of Header#magic
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, "\xfe\xff", 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]
  # +values+ may be the raw header string (unpacked with PACK) or an array
  # of field values; defaults to a fresh empty-document header. validates.
  def initialize values=DEFAULT
    values = values.unpack(PACK) if String === values
    super(*values)
    validate!
  end
  # serialize back to the binary on-disk form
  def to_s
    to_a.pack PACK
  end
  # raises FormatError for fatal inconsistencies; merely warns (via Log)
  # about suspicious but tolerable values. returns true on success.
  def validate!
    raise FormatError, "OLE2 signature is invalid" unless magic == MAGIC
    if num_bat == 0 or # is that valid for a completely empty file?
      # not sure about this one. basically to do max possible bat given size of mbat
      num_bat > 109 && num_bat > 109 + num_mbat * (1 << b_shift - 2) or
      # shouldn't need to use the mbat as there is enough space in the header block
      num_bat < 109 && num_mbat != 0 or
      # given the size of the header is 76, if b_shift <= 6, blocks address the header.
      s_shift > b_shift or b_shift <= 6 or b_shift >= 31 or
      # we only handle little endian
      byte_order != "\xfe\xff"
      raise FormatError, "not valid OLE2 structured storage file"
    end
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    if threshold != 4096 or
      num_mbat == 0 && mbat_start != AllocationTable::EOC or
      reserved != "\x00" * 6
      Log.warn "may not be a valid OLE2 structured storage file"
    end
    true
  end
end
#
# +AllocationTable+'s hold the chains corresponding to files. Given
# an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
# the blocks that make up that file.
#
# There are 2 allocation tables, the bbat, and sbat, for big and small
# blocks respectively. The block chain should be loaded using either
# <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
# as appropriate.
#
# Whether or not big or small blocks are used for a file depends on
# whether its size is over the <tt>Header#threshold</tt> level.
#
# An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
# which are stored in blocks throughout the file. The blocks are either
# big or small, and are accessed using the <tt>AllocationTable</tt>.
#
# The bbat allocation table's data is stored in the spare room in the header
# block, and in extra blocks throughout the file as referenced by the meta
# bat. That chain is linear, as there is no higher level table.
#
# AllocationTable.new is used to create an empty table. It can parse a string
# with the #load method. Serialization is accomplished with the #to_s method.
#
# The table itself is an Array of 32-bit entries: each index is a block
# number and each value is the next block in the chain (or a sentinel).
class AllocationTable < Array
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks are used for storing the allocation table chains
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc
  attr_reader :ole, :io, :block_size
  def initialize ole
    @ole = ole
    # @sparse tracks whether the table may contain AVAIL entries - a speedup
    # for free_block below.
    @sparse = true
    super()
  end
  # replace table contents with 32-bit little-endian entries from +data+
  def load data
    replace data.unpack('V*')
  end
  # return a copy of the table with trailing AVAIL entries stripped
  def truncate
    # this strips trailing AVAILs. come to think of it, this has the potential to break
    # bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
    # very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
    # at load time.
    temp = reverse
    not_avail = temp.find { |b| b != AVAIL } and temp = temp[temp.index(not_avail)..-1]
    temp.reverse
  end
  # in-place version of #truncate
  def truncate!
    replace truncate
  end
  # serialize, padded with AVAIL out to a whole number of (big) blocks
  def to_s
    table = truncate
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'V*'
  end
  # follow the chain from head block +idx+, returning the list of blocks.
  # rewrote this to be non-recursive as it broke on a large attachment
  # chain with a stack error
  def chain idx
    a = []
    until idx >= META_BAT
      raise FormatError, "broken allocationtable chain" if idx < 0 || idx > length
      a << idx
      idx = self[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end
  # Turn a chain (an array given by +chain+) of blocks (optionally
  # truncated to +size+) into an array of arrays describing the stretches of
  # bytes in the file that it belongs to.
  #
  # The blocks are Big or Small blocks depending on the table type.
  def blocks_to_ranges chain, size=nil
    # truncate the chain if required
    chain = chain[0...(size.to_f / block_size).ceil] if size
    # convert chain to ranges of the block size
    ranges = chain.map { |i| [block_size * i, block_size] }
    # truncate final range if required
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end
  # +chain+ may be a head block (resolved through the table) or an already
  # resolved block array
  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end
  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
  def open chain, size=nil, &block
    RangesIO.open @io, :ranges => ranges(chain, size), &block
  end
  def read chain, size=nil
    open chain, size, &:read
  end
  # catch any method that may add an AVAIL somewhere in the middle, thus invalidating
  # the @sparse speedup for free_block. annoying using eval, but define_method won't
  # work for this.
  # FIXME
  [:map!, :collect!].each do |name|
    eval <<-END
      def #{name}(*args, &block)
        @sparse = true
        super
      end
    END
  end
  def []= idx, val
    @sparse = true if val == AVAIL
    super
  end
  # index of a block free for allocation, extending the table if none exists
  def free_block
    if @sparse
      # note: relies on no chain ever using block 0... index 0 would be
      # returned here as a falsy-looking but valid hit otherwise.
      i = index(AVAIL) and return i
    end
    @sparse = false
    push AVAIL
    length - 1
  end
  # must return first_block. modifies +blocks+ in place
  def resize_chain blocks, size
    new_num_blocks = (size / block_size.to_f).ceil
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| self[blocks[i]] = AVAIL }
      self[blocks[new_num_blocks-1]] = EOC if new_num_blocks > 0
      blocks.slice! new_num_blocks..-1
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = free_block
        # connect the chain. handle corner case of blocks being [] initially
        self[last_block] = block if last_block
        blocks << block
        last_block = block
        self[last_block] = EOC
      end
    end
    # update ranges, and return that also now
    blocks
  end
  # table of big blocks, backed directly by the document's io
  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end
    # Big blocks are kind of -1 based, in order to not clash with the header.
    def blocks_to_ranges blocks, size
      super blocks.map { |b| b + 1 }, size
    end
  end
  # table of small blocks, backed by the root entry's stream (sb_file)
  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
# like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
# AllocationTable, and can be resized. used for read/write to 3 streams:
# 1. serialized dirent data
# 2. sbat table data
# 3. all dirents, but through RangesIOMigrateable below
#
# Note that all internal access to first_block is through accessors, as it is sometimes
# useful to redirect it.
class RangesIOResizeable < RangesIO
  # the AllocationTable that backs this stream's block chain
  attr_reader :bat
  # head block of the chain; accessor so subclasses can redirect it
  attr_accessor :first_block
  # +bat+:: the AllocationTable to allocate from
  # +mode+:: optional mode string (may be omitted, hash shifts over)
  # +params+:: :first_block (required) and optional :size
  def initialize bat, mode='r', params={}
    mode, params = 'r', mode if Hash === mode
    first_block, size = params.values_at :first_block, :size
    raise ArgumentError, 'must specify first_block' unless first_block
    @bat = bat
    self.first_block = first_block
    # we now cache the blocks chain, for faster resizing.
    @blocks = @bat.chain first_block
    super @bat.io, mode, :ranges => @bat.ranges(@blocks, size)
  end
  # grow or shrink the stream to +size+ bytes, re-allocating blocks in the
  # bat and refreshing the cached chain/ranges.
  def truncate size
    # note that old_blocks is != @ranges.length necessarily. i'm planning to write a
    # merge_ranges function that merges sequential ranges into one as an optimization.
    @bat.resize_chain @blocks, size
    @ranges = @bat.ranges @blocks, size
    @pos = @size if @pos > size
    self.first_block = @blocks.empty? ? AllocationTable::EOC : @blocks.first
    # don't know if this is required, but we explicitly request our @io to grow if necessary
    # we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
    # can be made.
    # maybe its ok to just seek out there later??
    max = @ranges.map { |pos, len| pos + len }.max || 0
    @io.truncate max if max > @io.size
    @size = size
  end
end
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
# between bats based on size, and updating the dirent.
class RangesIOMigrateable < RangesIOResizeable
  # the Dirent whose stream this io wraps; it owns first_block and size
  attr_reader :dirent
  def initialize dirent, mode='r'
    @dirent = dirent
    # pick the big or small bat based on the stream's current size
    super @dirent.ole.bat_for_size(@dirent.size), mode,
      :first_block => @dirent.first_block, :size => @dirent.size
  end
  # Resize, migrating data between the big and small allocation tables when
  # +size+ crosses the header threshold.
  def truncate size
    bat = @dirent.ole.bat_for_size size
    # compare table *kinds*, not instances, to decide whether to migrate
    if bat.class != @bat.class
      # bat migration needed! we need to backup some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one buffer.
      # backup this
      pos = @pos
      @pos = 0
      keep = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old bat, and
      # rewrite the dirent's first_block
      super 0
      @bat = bat
      # just change the underlying io from right under everyone :)
      @io = bat.io
      # important to do this now, before the write. as the below write will always
      # migrate us back to sbat! this will now allocate us +size+ in the new bat.
      super
      @pos = 0
      write keep
      @pos = pos
    else
      super
    end
    # now just update the file
    @dirent.size = size
  end
  # forward this to the dirent
  def first_block
    @dirent.first_block
  end
  def first_block= val
    @dirent.first_block = val
  end
end
#
# A class which wraps an ole directory entry. Can be either a directory
# (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
#
# Most interaction with <tt>Ole::Storage</tt> is through this class.
# The 2 most important functions are <tt>Dirent#children</tt>, and
# <tt>Dirent#data</tt>.
#
# was considering separate classes for dirs and files. some methods/attrs only
# applicable to one or the other.
#
# As with the other classes, #to_s performs the serialization.
#
class Dirent < Struct.new(
:name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
:clsid, :flags, # dirs only
:create_time_str, :modify_time_str, # files only
:first_block, :size, :reserved
)
include RecursivelyEnumerable
PACK = 'a64 v C C V3 a16 V a8 a8 V2 a4'
SIZE = 128
TYPE_MAP = {
# this is temporary
0 => :empty,
1 => :dir,
2 => :file,
5 => :root
}
# something to do with the fact that the tree is supposed to be red-black
COLOUR_MAP = {
0 => :red,
1 => :black
}
# used in the next / prev / child stuff to show that the tree ends here.
# also used for first_block for directory.
EOT = 0xffffffff
DEFAULT = [
0.chr * 2, 2, 0, # will get overwritten
1, EOT, EOT, EOT,
0.chr * 16, 0, nil, nil,
AllocationTable::EOC, 0, 0.chr * 4
]
# i think its just used by the tree building
attr_accessor :idx
# This returns all the children of this +Dirent+. It is filled in
# when the tree structure is recreated.
attr_accessor :children
attr_accessor :name
attr_reader :ole, :type, :create_time, :modify_time
# Construct a dirent for +ole+, either from on-disk +values+ (a raw 128-byte
# string unpacked with PACK, or a field array) or from the DEFAULT template
# plus +params+ (:name, :type).
def initialize ole, values=DEFAULT, params={}
  @ole = ole
  # allow (ole, params) form - shift the hash over
  values, params = DEFAULT, values if Hash === values
  values = values.unpack(PACK) if String === values
  super(*values)
  # extra parsing from the actual struct values
  @name = params[:name] || Types::Variant.load(Types::VT_LPWSTR, name_utf16[0...name_len])
  @type = if params[:type]
    unless TYPE_MAP.values.include?(params[:type])
      raise ArgumentError, "unknown type #{params[:type].inspect}"
    end
    params[:type]
  else
    TYPE_MAP[type_id] or raise FormatError, "unknown type_id #{type_id.inspect}"
  end
  # further extra type specific stuff
  if file?
    default_time = @ole.params[:update_timestamps] ? Time.now : nil
    @create_time ||= default_time
    @modify_time ||= default_time
    @create_time = Types::Variant.load(Types::VT_FILETIME, create_time_str) if create_time_str
    # BUGFIX: this previously parsed create_time_str here too (copy/paste
    # error), so the loaded modify time was always the create time.
    @modify_time = Types::Variant.load(Types::VT_FILETIME, modify_time_str) if modify_time_str
    @children = nil
  else
    @create_time = nil
    @modify_time = nil
    self.size = 0 unless @type == :root
    @children = []
  end
  # to silence warnings. used for tree building at load time
  # only.
  @idx = nil
end
def open mode='r'
raise Errno::EISDIR unless file?
io = RangesIOMigrateable.new self, mode
# TODO work on the mode string stuff a bit more.
# maybe let the io object know about the mode, so it can refuse
# to work for read/write appropriately. maybe redefine all unusable
# methods using singleton class to throw errors.
# for now, i just want to implement truncation on use of 'w'. later,
# i need to do 'a' etc.
case mode
when 'r', 'r+'
# as i don't enforce reading/writing, nothing changes here. kind of
# need to enforce tt if i want modify times to work better.
@modify_time = Time.now if mode == 'r+'
when 'w'
@modify_time = Time.now
# io.truncate 0
#else
# raise NotImplementedError, "unsupported mode - #{mode.inspect}"
end
if block_given?
begin yield io
ensure; io.close
end
else io
end
end
def read limit=nil
open { |io| io.read limit }
end
def file?
type == :file
end
def dir?
# to count root as a dir.
!file?
end
# maybe need some options regarding case sensitivity.
def / name
children.find { |child| name === child.name }
end
def [] idx
if String === idx
#warn 'String form of Dirent#[] is deprecated'
self / idx
else
super
end
end
# move to ruby-msg. and remove from here
def time
#warn 'Dirent#time is deprecated'
create_time || modify_time
end
def each_child(&block)
@children.each(&block)
end
# flattens the tree starting from here into +dirents+. note it modifies its argument.
def flatten dirents=[]
@idx = dirents.length
dirents << self
if file?
self.prev = self.next = self.child = EOT
else
children.each { |child| child.flatten dirents }
self.child = Dirent.flatten_helper children
end
dirents
end
# i think making the tree structure optimized is actually more complex than this, and
# requires some intelligent ordering of the children based on names, but as long as
# it is valid its ok.
# actually, i think its ok. gsf for example only outputs a singly-linked-list, where
# prev is always EOT.
def self.flatten_helper children
return EOT if children.empty?
i = children.length / 2
this = children[i]
this.prev, this.next = [(0...i), (i+1..-1)].map { |r| flatten_helper children[r] }
this.idx
end
def to_s
tmp = Types::Variant.dump(Types::VT_LPWSTR, name)
tmp = tmp[0, 62] if tmp.length > 62
tmp += 0.chr * 2
self.name_len = tmp.length
self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
# type_id can perhaps be set in the initializer, as its read only now.
self.type_id = TYPE_MAP.to_a.find { |id, name| @type == name }.first
# for the case of files, it is assumed that that was handled already
# note not dir?, so as not to override root's first_block
self.first_block = Dirent::EOT if type == :dir
if file?
# this is messed up. it changes the time stamps regardless of whether the file
# was actually touched. instead, any open call with a writeable mode, should update
# the modify time. create time would be set in new.
if @ole.params[:update_timestamps]
self.create_time_str = Types::Variant.dump Types::VT_FILETIME, @create_time
self.modify_time_str = Types::Variant.dump Types::VT_FILETIME, @modify_time
end
else
self.create_time_str = 0.chr * 8
self.modify_time_str = 0.chr * 8
end
to_a.pack PACK
end
def inspect
str = "#<Dirent:#{name.inspect}"
# perhaps i should remove the data snippet. its not that useful anymore.
# there is also some dir specific stuff. like clsid, flags, that i should
# probably include
if file?
tmp = read 9
data = tmp.length == 9 ? tmp[0, 5] + '...' : tmp
str << " size=#{size}" +
"#{modify_time ? ' modify_time=' + modify_time.to_s.inspect : nil}" +
" data=#{data.inspect}"
end
str + '>'
end
def delete child
# remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
raise ArgumentError, "#{child.inspect} not a child of #{self.inspect}" unless @children.delete child
# free our blocks
child.open { |io| io.truncate 0 }
end
def self.copy src, dst
# copies the contents of src to dst. must be the same type. this will throw an
# error on copying to root. maybe this will recurse too much for big documents??
raise ArgumentError, 'differing types' if src.file? and !dst.file?
dst.name = src.name
if src.dir?
src.children.each do |src_child|
dst_child = Dirent.new dst.ole, :type => src_child.type
dst.children << dst_child
Dirent.copy src_child, dst_child
end
else
src.open do |src_io|
dst.open { |dst_io| IO.copy src_io, dst_io }
end
end
end
end
end
end

View File

@ -0,0 +1,423 @@
#
# = Introduction
#
# This file intends to provide file system-like api support, a la <tt>zip/zipfilesystem</tt>.
#
# = TODO
#
# - need to implement some more IO functions on RangesIO, like #puts, #print
# etc, like AbstractOutputStream from zipfile.
#
# - check Dir.mkdir, and File.open, and File.rename, to add in filename
# length checks (max 32 / 31 or something).
# do the automatic truncation, and add in any necessary warnings.
#
# - File.split('a/') == File.split('a') == ['.', 'a']
# the implication of this, is that things that try to force directory
# don't work. like, File.rename('a', 'b'), should work if a is a file
# or directory, but File.rename('a/', 'b') should only work if a is
# a directory. tricky, need to clean things up a bit more.
# i think a general path name => dirent method would work, with flags
# about what should raise an error.
#
# - Need to look at streamlining things after getting all the tests passing,
# as this file's getting pretty long - almost half the real implementation.
# and is probably more inefficient than necessary.
# too many exceptions in the expected path of certain functions.
#
# - should look at profiles before and after switching ruby-msg to use
# the filesystem api.
#
require 'ole/storage'
module Ole # :nodoc:
class Storage
def file
@file ||= FileClass.new self
end
def dir
@dir ||= DirClass.new self
end
# tries to get a dirent for path. return nil if it doesn't exist
# (change it)
def dirent_from_path path
dirent = @root
path = file.expand_path path
path = path.sub(/^\/*/, '').sub(/\/*$/, '').split(/\/+/)
until path.empty?
return nil if dirent.file?
return nil unless dirent = dirent/path.shift
end
dirent
end
class FileClass
class Stat
attr_reader :ftype, :size, :blocks, :blksize
attr_reader :nlink, :uid, :gid, :dev, :rdev, :ino
def initialize dirent
@dirent = dirent
@size = dirent.size
if file?
@ftype = 'file'
bat = dirent.ole.bat_for_size(dirent.size)
@blocks = bat.chain(dirent.first_block).length
@blksize = bat.block_size
else
@ftype = 'directory'
@blocks = 0
@blksize = 0
end
# a lot of these are bogus. ole file format has no analogs
@nlink = 1
@uid, @gid = 0, 0
@dev, @rdev = 0, 0
@ino = 0
# need to add times - atime, mtime, ctime.
end
alias rdev_major :rdev
alias rdev_minor :rdev
def file?
@dirent.file?
end
def directory?
@dirent.dir?
end
def size?
size if file?
end
def inspect
pairs = (instance_variables - ['@dirent']).map do |n|
"#{n[1..-1]}=#{instance_variable_get n}"
end
"#<#{self.class} #{pairs * ', '}>"
end
end
def initialize ole
@ole = ole
end
def expand_path path
# get the raw stored pwd value (its blank for root)
pwd = @ole.dir.instance_variable_get :@pwd
# its only absolute if it starts with a '/'
path = "#{pwd}/#{path}" unless path =~ /^\//
# at this point its already absolute. we use File.expand_path
# just for the .. and . handling
# No longer use RUBY_PLATFORM =~ /win/ as it matches darwin. better way?
File.expand_path(path)[File::ALT_SEPARATOR == "\\" ? (2..-1) : (0..-1)]
end
# +orig_path+ is just so that we can use the requested path
# in the error messages even if it has been already modified
def dirent_from_path path, orig_path=nil
orig_path ||= path
dirent = @ole.dirent_from_path path
raise Errno::ENOENT, orig_path unless dirent
raise Errno::EISDIR, orig_path if dirent.dir?
dirent
end
private :dirent_from_path
def exists? path
!!@ole.dirent_from_path(path)
end
alias exist? :exists?
def file? path
dirent = @ole.dirent_from_path path
dirent and dirent.file?
end
def directory? path
dirent = @ole.dirent_from_path path
dirent and dirent.dir?
end
def open path, mode='r', &block
if IO::Mode.new(mode).create?
begin
dirent = dirent_from_path path
rescue Errno::ENOENT
# maybe instead of repeating this everywhere, i should have
# a get_parent_dirent function.
parent_path, basename = File.split expand_path(path)
parent = @ole.dir.send :dirent_from_path, parent_path, path
parent.children << dirent = Dirent.new(@ole, :type => :file, :name => basename)
end
else
dirent = dirent_from_path path
end
dirent.open mode, &block
end
# explicit wrapper instead of alias to inhibit block
def new path, mode='r'
open path, mode
end
def size path
dirent_from_path(path).size
rescue Errno::EISDIR
# kind of arbitrary. I'm getting 4096 from ::File, but
# the zip tests want 0.
0
end
def size? path
dirent_from_path(path).size
# any other exceptions i need to rescue?
rescue Errno::ENOENT, Errno::EISDIR
nil
end
def stat path
# we do this to allow dirs.
dirent = @ole.dirent_from_path path
raise Errno::ENOENT, path unless dirent
Stat.new dirent
end
def read path
open path, &:read
end
# most of the work this function does is moving the dirent between
# 2 parents. the actual name changing is quite simple.
# File.rename can move a file into another folder, which is why i've
# done it too, though i think its not always possible...
#
# FIXME File.rename can be used for directories too....
def rename from_path, to_path
# check what we want to rename from exists. do it this
# way to allow directories.
dirent = @ole.dirent_from_path from_path
raise Errno::ENOENT, from_path unless dirent
# delete what we want to rename to if necessary
begin
unlink to_path
rescue Errno::ENOENT
# we actually get here, but rcov doesn't think so. add 1 + 1 to
# keep rcov happy for now... :)
1 + 1
end
# reparent the dirent
from_parent_path, from_basename = File.split expand_path(from_path)
to_parent_path, to_basename = File.split expand_path(to_path)
from_parent = @ole.dir.send :dirent_from_path, from_parent_path, from_path
to_parent = @ole.dir.send :dirent_from_path, to_parent_path, to_path
from_parent.children.delete dirent
# and also change its name
dirent.name = to_basename
to_parent.children << dirent
0
end
# crappy copy from Dir.
def unlink(*paths)
paths.each do |path|
dirent = @ole.dirent_from_path path
# i think we should free all of our blocks from the
# allocation table.
# i think if you run repack, all free blocks should get zeroed,
# but currently the original data is there unmodified.
open(path) { |f| f.truncate 0 }
# remove ourself from our parent, so we won't be part of the dir
# tree at save time.
parent_path, basename = File.split expand_path(path)
parent = @ole.dir.send :dirent_from_path, parent_path, path
parent.children.delete dirent
end
paths.length # hmmm. as per ::File ?
end
alias delete :unlink
end
#
# an *instance* of this class is supposed to provide similar methods
# to the class methods of Dir itself.
#
# pretty complete. like zip/zipfilesystem's implementation, i provide
# everything except chroot and glob. glob could be done with a glob
# to regex regex, and then simply match in the entries array... although
# recursive glob complicates that somewhat.
#
# Dir.chroot, Dir.glob, Dir.[], and Dir.tmpdir is the complete list.
class DirClass
def initialize ole
@ole = ole
@pwd = ''
end
# +orig_path+ is just so that we can use the requested path
# in the error messages even if it has been already modified
def dirent_from_path path, orig_path=nil
orig_path ||= path
dirent = @ole.dirent_from_path path
raise Errno::ENOENT, orig_path unless dirent
raise Errno::ENOTDIR, orig_path unless dirent.dir?
dirent
end
private :dirent_from_path
def open path
dir = Dir.new path, entries(path)
if block_given?
yield dir
else
dir
end
end
# as for file, explicit alias to inhibit block
def new path
open path
end
# pwd is always stored without the trailing slash. we handle
# the root case here
def pwd
if @pwd.empty?
'/'
else
@pwd
end
end
alias getwd :pwd
def chdir orig_path
# make path absolute, squeeze slashes, and remove trailing slash
path = @ole.file.expand_path(orig_path).gsub(/\/+/, '/').sub(/\/$/, '')
# this is just for the side effects of the exceptions if invalid
dirent_from_path path, orig_path
if block_given?
old_pwd = @pwd
begin
@pwd = path
yield
ensure
@pwd = old_pwd
end
else
@pwd = path
0
end
end
def entries path
dirent = dirent_from_path path
# Not sure about adding on the dots...
entries = %w[. ..] + dirent.children.map(&:name)
# do some checks about un-reachable files
seen = {}
entries.each do |n|
Log.warn "inaccessible file (filename contains slash) - #{n.inspect}" if n['/']
Log.warn "inaccessible file (duplicate filename) - #{n.inspect}" if seen[n]
seen[n] = true
end
entries
end
def foreach path, &block
entries(path).each(&block)
end
# there are some other important ones, like:
# chroot (!), glob etc etc. for now, i think
def mkdir path
# as for rmdir below:
parent_path, basename = File.split @ole.file.expand_path(path)
# note that we will complain about the full path despite accessing
# the parent path. this is consistent with ::Dir
parent = dirent_from_path parent_path, path
# now, we first should ensure that it doesn't already exist
# either as a file or a directory.
raise Errno::EEXIST, path if parent/basename
parent.children << Dirent.new(@ole, :type => :dir, :name => basename)
0
end
def rmdir path
dirent = dirent_from_path path
raise Errno::ENOTEMPTY, path unless dirent.children.empty?
# now delete it, how to do that? the canonical representation that is
# maintained is the root tree, and the children array. we must remove it
# from the children array.
# we need the parent then. this sucks but anyway:
# we need to split the path. but before we can do that, we need
# to expand it first. eg. say we need the parent to unlink
# a/b/../c. the parent should be a, not a/b/.., or a/b.
parent_path, basename = File.split @ole.file.expand_path(path)
# this shouldn't be able to fail if the above didn't
parent = dirent_from_path parent_path
# note that the way this currently works, on save and repack time this will get
# reflected. to work properly, ie to make a difference now it would have to re-write
# the dirent. i think that Ole::Storage#close will handle that. and maybe include a
# #repack.
parent.children.delete dirent
0 # hmmm. as per ::Dir ?
end
alias delete :rmdir
alias unlink :rmdir
# note that there is nothing remotely ole specific about
# this class. it simply provides the dir like sequential access
# methods on top of an array.
# hmm, doesn't throw the IOError's on use of a closed directory...
class Dir
include Enumerable
attr_reader :path
def initialize path, entries
@path, @entries, @pos = path, entries, 0
@closed = false
end
def pos
raise IOError if @closed
@pos
end
def each(&block)
raise IOError if @closed
@entries.each(&block)
end
def close
@closed = true
end
def read
raise IOError if @closed
@entries[pos]
ensure
@pos += 1 if pos < @entries.length
end
def pos= pos
raise IOError if @closed
@pos = [[0, pos].max, @entries.length].min
end
def rewind
raise IOError if @closed
@pos = 0
end
alias tell :pos
alias seek :pos=
end
end
end
end

View File

@ -0,0 +1,148 @@
require 'ole/types/property_set'
module Ole
class Storage
#
# The MetaData class is designed to be high level interface to all the
# underlying meta data stored within different sections, themselves within
# different property set streams.
#
# With this class, you can simply get properties using their names, without
# needing to know about the underlying guids, property ids etc.
#
# Example:
#
# Ole::Storage.open('test.doc') { |ole| p ole.meta_data.doc_author }
#
# TODO:
#
# * add write support
# * fix some of the missing type coercion (eg FileTime)
# * maybe add back the ability to access individual property sets as a unit
# directly. ie <tt>ole.summary_information</tt>. Is this useful?
# * full key support, for unknown keys, like
# <tt>ole.meta_data[myguid, myid]</tt>. probably needed for user-defined
# properties too.
#
class MetaData
include Enumerable
FILE_MAP = {
Types::PropertySet::FMTID_SummaryInformation => "\005SummaryInformation",
Types::PropertySet::FMTID_DocSummaryInfo => "\005DocumentSummaryInformation"
}
FORMAT_MAP = {
'MSWordDoc' => :doc
}
CLSID_EXCEL97 = Types::Clsid.parse "{00020820-0000-0000-c000-000000000046}"
CLSID_EXCEL95 = Types::Clsid.parse "{00020810-0000-0000-c000-000000000046}"
CLSID_WORD97 = Types::Clsid.parse "{00020906-0000-0000-c000-000000000046}"
CLSID_WORD95 = Types::Clsid.parse "{00020900-0000-0000-c000-000000000046}"
CLSID_MAP = {
CLSID_EXCEL97 => :xls,
CLSID_EXCEL95 => :xls,
CLSID_WORD97 => :doc,
CLSID_WORD95 => :doc
}
MIME_TYPES = {
:xls => 'application/vnd.ms-excel',
:doc => 'application/msword',
:ppt => 'application/vnd.ms-powerpoint',
# not registered at IANA, but seems most common usage
:msg => 'application/vnd.ms-outlook',
# this is my default fallback option. also not registered at IANA.
# file(1)'s default is application/msword, which is useless...
nil => 'application/x-ole-storage'
}
def initialize ole
@ole = ole
end
# i'm thinking of making file_format and mime_type available through
# #[], #each, and #to_h also, as calculated meta data (not assignable)
def comp_obj
return {} unless dirent = @ole.root["\001CompObj"]
data = dirent.read
# see - https://gnunet.org/svn/Extractor/doc/StarWrite_File_Format.html
# compobj_version: 0x0001
# byte_order: 0xffe
# windows_version: 0x00000a03 (win31 apparently)
# marker: 0xffffffff
compobj_version, byte_order, windows_version, marker, clsid =
data.unpack("vvVVa#{Types::Clsid::SIZE}")
strings = []
i = 28
while i < data.length
len = data[i, 4].unpack('V').first
i += 4
strings << data[i, len - 1]
i += len
end
# in the unknown chunk, you usually see something like 'Word.Document.6'
{:username => strings[0], :file_format => strings[1], :unknown => strings[2..-1]}
end
private :comp_obj
def file_format
comp_obj[:file_format]
end
def mime_type
# based on the CompObj stream contents
type = FORMAT_MAP[file_format]
return MIME_TYPES[type] if type
# based on the root clsid
type = CLSID_MAP[Types::Clsid.load(@ole.root.clsid)]
return MIME_TYPES[type] if type
# fallback to heuristics
has_file = Hash[*@ole.root.children.map { |d| [d.name.downcase, true] }.flatten]
return MIME_TYPES[:msg] if has_file['__nameid_version1.0'] or has_file['__properties_version1.0']
return MIME_TYPES[:doc] if has_file['worddocument'] or has_file['document']
return MIME_TYPES[:xls] if has_file['workbook'] or has_file['book']
MIME_TYPES[nil]
end
def [] key
pair = Types::PropertySet::PROPERTY_MAP[key.to_s] or return nil
file = FILE_MAP[pair.first] or return nil
dirent = @ole.root[file] or return nil
dirent.open { |io| return Types::PropertySet.new(io)[key] }
end
def []= key, value
raise NotImplementedError, 'meta data writes not implemented'
end
def each(&block)
FILE_MAP.values.each do |file|
dirent = @ole.root[file] or next
dirent.open { |io| Types::PropertySet.new(io).each(&block) }
end
end
def to_h
inject({}) { |hash, (name, value)| hash.update name.to_sym => value }
end
def method_missing name, *args, &block
return super unless args.empty?
pair = Types::PropertySet::PROPERTY_MAP[name.to_s] or return super
self[name]
end
end
def meta_data
@meta_data ||= MetaData.new(self)
end
end
end

View File

@ -1,40 +1,73 @@
#! /usr/bin/ruby
#
# A file with general support functions used by most files in the project.
#
# These are the only methods added to other classes.
#
require 'logger'
require 'stringio'
require 'enumerator'
class String # :nodoc:
	# plural of String#index. returns all offsets of +string+. rename to indices?
	#
	# note that it doesn't check for overlapping values.
	def indexes string
		offsets = []
		scan(/#{Regexp.quote string}/m) { offsets << $~.begin(0) }
		offsets
	end

	# yield successive slices of the string, each at most +size+ bytes long
	def each_chunk size
		offset = 0
		while offset < length
			yield self[offset, size]
			offset += size
		end
	end
end
class File # :nodoc:
	# for interface consistency with StringIO etc (rather than adding #stat
	# to them). used by RangesIO so any data source can report its length.
	def size
		stat.size
	end
end
class Symbol # :nodoc:
	# ruby >= 1.8.7 provides Symbol#to_proc natively; only backport it
	# where missing. (the previous text was merged old/new diff residue
	# with an unterminated duplicate definition.)
	unless :x.respond_to? :to_proc
		def to_proc
			proc { |a| a.send self }
		end
	end
end
module Enumerable # :nodoc:
	# backports guarded on feature detection; on modern rubies the native
	# implementations are left untouched. (the previous text was merged
	# old/new diff residue with unterminated duplicate definitions.)
	unless [].respond_to? :group_by
		# 1.9 backport
		def group_by
			hash = Hash.new { |h, key| h[key] = [] }
			each { |item| hash[yield(item)] << item }
			hash
		end
	end
	unless [].respond_to? :sum
		def sum initial=0
			inject(initial) { |a, b| a + b }
		end
	end
end
# move to support?
class IO # :nodoc:
	# Copy data from IO-like object +src+, to +dst+, in 4096 byte
	# chunks, until +src+ is exhausted.
	def self.copy src, dst
		dst.write src.read(4096) until src.eof?
	end
end
class Logger # :nodoc:
# A helper method for creating <tt>Logger</tt>s which produce call stack
# A helper method for creating a +Logger+ which produce call stack
# in their output
def self.new_with_callstack logdev=STDERR
log = Logger.new logdev
@ -48,4 +81,176 @@ class Logger # :nodoc:
end
log
end
end
end
# Include this module into a class that defines #each_child. It should
# maybe use #each instead, but its easier to be more specific, and use
# an alias.
#
# I don't want to force the class to cache children (eg where children
# are loaded on request in pst), because that forces the whole tree to
# be loaded. So, the methods should only call #each_child once, and
# breadth first iteration holds its own copy of the children around.
#
# Main methods are #recursive, and #to_tree
module RecursivelyEnumerable # :nodoc:
	# yield every descendant, depth first. the receiver itself is not
	# yielded here - see #each_recursive for that.
	def each_recursive_depth_first(&block)
		each_child do |child|
			yield child
			if child.respond_to? :each_recursive_depth_first
				child.each_recursive_depth_first(&block)
			end
		end
	end

	# don't think this is actually a proper breadth first recursion. only first
	# level is breadth first.
	def each_recursive_breadth_first(&block)
		children = []
		each_child do |child|
			children << child if child.respond_to? :each_recursive_breadth_first
			yield child
		end
		children.each { |child| child.each_recursive_breadth_first(&block) }
	end

	# yield self, then all descendants, using the traversal named by
	# +mode+ (:depth_first or :breadth_first).
	def each_recursive mode=:depth_first, &block
		# we always actually yield ourself (the tree root) before recursing
		yield self
		send "each_recursive_#{mode}", &block
	end

	# the idea of this function, is to allow use of regular Enumerable methods
	# in a recursive fashion. eg:
	#
	#   # just looks at top level children
	#   root.find { |child| child.some_condition? }
	#   # recurse into all children getting non-folders, breadth first
	#   root.recursive(:breadth_first).select { |child| !child.folder? }
	#   # just get everything
	#   items = root.recursive.to_a
	#
	def recursive mode=:depth_first
		to_enum(:each_recursive, mode)
	end

	# streams a "tree" form of the recursively enumerable structure to +io+, or
	# return a string form instead if +io+ is not specified.
	#
	# mostly a debugging aid. can specify a different block which will be called
	# to provide the string form for each node.
	def to_tree io='', &inspect
		inspect ||= :inspect.to_proc
		io << "- #{inspect[self]}\n"
		# +child+ lags one step behind the iteration, so the final child
		# can be drawn with a closing "\\-" connector instead of "|-".
		recurse = proc do |node, prefix|
			child = nil
			node.each_child do |next_child|
				if child
					io << "#{prefix}|- #{inspect[child]}\n"
					recurse.call child, prefix + '| '
				end
				child = next_child
			end if node.respond_to?(:each_child)
			if child
				io << "#{prefix}\\- #{inspect[child]}\n"
				recurse.call child, prefix + ' '
			end
		end
		recurse.call self, ' '
		io
	end
end
# can include File::Constants
class IO
	# this is for jruby
	include File::Constants unless defined?(RDONLY)

	# nabbed from rubinius, and modified. converts a mode string such as
	# 'r+' or 'wb' into the equivalent File::Constants flag bits.
	# raises ArgumentError for anything unrecognised.
	def self.parse_mode mode
		ret = 0

		case mode[0, 1]
		when 'r'; ret |= RDONLY
		when 'w'; ret |= WRONLY | CREAT | TRUNC
		when 'a'; ret |= WRONLY | CREAT | APPEND
		else raise ArgumentError, "illegal access mode #{mode}"
		end

		(1...mode.length).each do |i|
			case mode[i, 1]
			when '+'; ret = (ret & ~(RDONLY | WRONLY)) | RDWR
			when 'b'; ret |= Mode::BINARY
			else raise ArgumentError, "illegal access mode #{mode}"
			end
		end

		ret
	end

	# small value object wrapping a set of open-mode flag bits
	class Mode
		# ruby 1.9 defines binary as 0, which isn't very helpful.
		# its 4 in rubinius. no longer using
		#
		# BINARY = 0x4 unless defined?(BINARY)
		#
		# for that reason, have my own constants module here
		module Constants
			include File::Constants
			BINARY = 0x4
		end

		include Constants
		NAMES = %w[rdonly wronly rdwr creat trunc append binary]

		attr_reader :flags
		# +flags+ may be a mode string (anything responding to #to_str)
		# or the flag bits themselves.
		def initialize flags
			flags = IO.parse_mode flags.to_str if flags.respond_to? :to_str
			# FIX: use Integer rather than Fixnum here - Fixnum was removed in
			# ruby 3.0, and Integer covers the same values on all versions.
			raise ArgumentError, "invalid flags - #{flags.inspect}" unless Integer === flags
			@flags = flags
		end

		def writeable?
			#(@flags & RDONLY) == 0
			(@flags & 0x3) != RDONLY
		end

		def readable?
			(@flags & WRONLY) == 0
		end

		def truncate?
			(@flags & TRUNC) != 0
		end

		def append?
			(@flags & APPEND) != 0
		end

		def create?
			(@flags & CREAT) != 0
		end

		def binary?
			(@flags & BINARY) != 0
		end

=begin
		# revisit this
		def apply io
			if truncate?
				io.truncate 0
			elsif append?
				io.seek IO::SEEK_END, 0
			end
		end
=end

		def inspect
			names = NAMES.map { |name| name if (flags & Mode.const_get(name.upcase)) != 0 }
			# rdonly is 0, so it never survives the mask above - add it back
			# when no access bits are set
			names.unshift 'rdonly' if (flags & 0x3) == 0
			"#<#{self.class} #{names.compact * '|'}>"
		end
	end
end

View File

@ -1,27 +1,2 @@
require 'ole/base'
module Ole # :nodoc:
	# FIXME
	module Types
		# Parse two 32 bit time values into a DateTime
		# Time is stored as a high and low 32 bit value, comprising the
		# 100's of nanoseconds since 1st january 1601 (Epoch).
		# struct FILETIME. see eg http://msdn2.microsoft.com/en-us/library/ms724284.aspx
		def self.load_time str
			# FIX: FILETIME is little-endian on disk, so unpack with 'V2' rather
			# than the native-endian 'L2', which misparses on big-endian hosts.
			low, high = str.unpack 'V2'
			time = EPOCH + (high * (1 << 32) + low) * 1e-7 / 86400 rescue return
			# extra sanity check...
			unless (1800...2100) === time.year
				Log.warn "ignoring unlikely time value #{time.to_s}"
				return nil
			end
			time
		end

		# turn a binary guid into something displayable.
		# this will probably become a proper class later
		def self.load_guid str
			# FIX: guids are serialized little-endian - 'V v v' rather than the
			# native-endian 'L S S' (matches the newer Clsid::PACK format).
			"{%08x-%04x-%04x-%02x%02x-#{'%02x' * 6}}" % str.unpack('V v v CC C6')
		end
	end
end
require 'ole/types/base'
require 'ole/types/property_set'

251
lib/ole/types/base.rb Normal file
View File

@ -0,0 +1,251 @@
require 'iconv'
require 'date'
require 'ole/base'
module Ole # :nodoc:
#
# The Types module contains all the serialization and deserialization code for standard ole
# types.
#
# It also defines all the variant type constants, and symbolic names.
#
module Types
# for anything that we don't have serialization code for
class Data < String
def self.load str
new str
end
def self.dump str
str.to_s
end
end
class Lpstr < String
def self.load str
# not sure if its always there, but there is often a trailing
# null byte.
new str.chomp(0.chr)
end
def self.dump str
# do i need to append the null byte?
str.to_s
end
end
# for VT_LPWSTR
class Lpwstr < String
FROM_UTF16 = Iconv.new 'utf-8', 'utf-16le'
TO_UTF16 = Iconv.new 'utf-16le', 'utf-8'
def self.load str
new FROM_UTF16.iconv(str).chomp(0.chr)
end
def self.dump str
# need to append nulls?
data = TO_UTF16.iconv str
# not sure if this is the recommended way to do it, but I want to treat
# the resulting utf16 data as regular bytes, not characters.
data.force_encoding Encoding::US_ASCII if data.respond_to? :encoding
data
end
end
# for VT_FILETIME
class FileTime < DateTime
SIZE = 8
EPOCH = new 1601, 1, 1
# Create a +DateTime+ object from a struct +FILETIME+
# (http://msdn2.microsoft.com/en-us/library/ms724284.aspx).
#
# Converts +str+ to two 32 bit time values, comprising the high and low 32 bits of
# the 100's of nanoseconds since 1st january 1601 (Epoch).
def self.load str
low, high = str.to_s.unpack 'V2'
# we ignore these, without even warning about it
return nil if low == 0 and high == 0
# switched to rational, and fixed the off by 1 second error i sometimes got.
# time = EPOCH + (high * (1 << 32) + low) / 1e7 / 86400 rescue return
# use const_get to ensure we can return anything which subclasses this (VT_DATE?)
const_get('EPOCH') + Rational(high * (1 << 32) + low, 1e7.to_i * 86400) rescue return
# extra sanity check...
#unless (1800...2100) === time.year
# Log.warn "ignoring unlikely time value #{time.to_s}"
# return nil
#end
#time
end
# +time+ should be able to be either a Time, Date, or DateTime.
def self.dump time
# i think i'll convert whatever i get to be a datetime, because of
# the covered range.
return 0.chr * SIZE unless time
time = time.send(:to_datetime) if Time === time
# don't bother to use const_get here
bignum = (time - EPOCH) * 86400 * 1e7.to_i
high, low = bignum.divmod 1 << 32
[low, high].pack 'V2'
end
def inspect
"#<#{self.class} #{to_s}>"
end
end
# for VT_CLSID
# Unlike most of the other conversions, the Guid's are serialized/deserialized by actually
# doing nothing! (eg, _load & _dump are null ops)
# Rather, its just a string with a different inspect string, and it includes a
# helper method for creating a Guid from that readable form (#format).
class Clsid < String
SIZE = 16
PACK = 'V v v CC C6'
def self.load str
new str.to_s
end
def self.dump guid
return 0.chr * SIZE unless guid
# allow use of plain strings in place of guids.
guid['-'] ? parse(guid) : guid
end
def self.parse str
vals = str.scan(/[a-f\d]+/i).map(&:hex)
if vals.length == 5
# this is pretty ugly
vals[3] = ('%04x' % vals[3]).scan(/../).map(&:hex)
vals[4] = ('%012x' % vals[4]).scan(/../).map(&:hex)
guid = new vals.flatten.pack(PACK)
return guid if guid.format.delete('{}') == str.downcase.delete('{}')
end
raise ArgumentError, 'invalid guid - %p' % str
end
def format
"%08x-%04x-%04x-%02x%02x-#{'%02x' * 6}" % unpack(PACK)
end
def inspect
"#<#{self.class}:{#{format}}>"
end
end
#
# The OLE variant types, extracted from
# http://www.marin.clara.net/COM/variant_type_definitions.htm.
#
# A subset is also in WIN32OLE::VARIANT, but its not cross platform (obviously).
#
# Use like:
#
# p Ole::Types::Variant::NAMES[0x001f] => 'VT_LPWSTR'
# p Ole::Types::VT_DATE # => 7
#
# The serialization / deserialization functions should be fixed to make it easier
# to work with. like
#
# Ole::Types.from_str(VT_DATE, data) # and
# Ole::Types.to_str(VT_DATE, data)
#
# Or similar, rather than having to do VT_* <=> ad hoc class name etc as it is
# currently.
#
module Variant
NAMES = {
0x0000 => 'VT_EMPTY',
0x0001 => 'VT_NULL',
0x0002 => 'VT_I2',
0x0003 => 'VT_I4',
0x0004 => 'VT_R4',
0x0005 => 'VT_R8',
0x0006 => 'VT_CY',
0x0007 => 'VT_DATE',
0x0008 => 'VT_BSTR',
0x0009 => 'VT_DISPATCH',
0x000a => 'VT_ERROR',
0x000b => 'VT_BOOL',
0x000c => 'VT_VARIANT',
0x000d => 'VT_UNKNOWN',
0x000e => 'VT_DECIMAL',
0x0010 => 'VT_I1',
0x0011 => 'VT_UI1',
0x0012 => 'VT_UI2',
0x0013 => 'VT_UI4',
0x0014 => 'VT_I8',
0x0015 => 'VT_UI8',
0x0016 => 'VT_INT',
0x0017 => 'VT_UINT',
0x0018 => 'VT_VOID',
0x0019 => 'VT_HRESULT',
0x001a => 'VT_PTR',
0x001b => 'VT_SAFEARRAY',
0x001c => 'VT_CARRAY',
0x001d => 'VT_USERDEFINED',
0x001e => 'VT_LPSTR',
0x001f => 'VT_LPWSTR',
0x0040 => 'VT_FILETIME',
0x0041 => 'VT_BLOB',
0x0042 => 'VT_STREAM',
0x0043 => 'VT_STORAGE',
0x0044 => 'VT_STREAMED_OBJECT',
0x0045 => 'VT_STORED_OBJECT',
0x0046 => 'VT_BLOB_OBJECT',
0x0047 => 'VT_CF',
0x0048 => 'VT_CLSID',
0x0fff => 'VT_ILLEGALMASKED',
0x0fff => 'VT_TYPEMASK',
0x1000 => 'VT_VECTOR',
0x2000 => 'VT_ARRAY',
0x4000 => 'VT_BYREF',
0x8000 => 'VT_RESERVED',
0xffff => 'VT_ILLEGAL'
}
CLASS_MAP = {
# haven't seen one of these. wonder if its same as FILETIME?
#'VT_DATE' => ?,
'VT_LPSTR' => Lpstr,
'VT_LPWSTR' => Lpwstr,
'VT_FILETIME' => FileTime,
'VT_CLSID' => Clsid
}
module Constants
NAMES.each { |num, name| const_set name, num }
end
def self.load type, str
type = NAMES[type] or raise ArgumentError, 'unknown ole type - 0x%04x' % type
(CLASS_MAP[type] || Data).load str
end
def self.dump type, variant
type = NAMES[type] or raise ArgumentError, 'unknown ole type - 0x%04x' % type
(CLASS_MAP[type] || Data).dump variant
end
end
include Variant::Constants
# deprecated aliases, kept mostly for the benefit of ruby-msg, until
# i release a new version.
# Parse the serialized guid in +str+.
# Deprecated - call Variant.load with VT_CLSID directly in new code.
def self.load_guid str
	Variant.load Variant::Constants::VT_CLSID, str
end
# Parse the serialized timestamp in +str+.
# Deprecated - call Variant.load with VT_FILETIME directly in new code.
def self.load_time str
	Variant.load Variant::Constants::VT_FILETIME, str
end
# deprecated aliases for Lpwstr's string conversion codecs
# (presumably utf-16le converters - see Lpwstr; kept for ruby-msg).
FROM_UTF16 = Lpwstr::FROM_UTF16
TO_UTF16 = Lpwstr::TO_UTF16
end
end

View File

@ -0,0 +1,165 @@
require 'ole/types'
require 'yaml'
module Ole
module Types
#
# The PropertySet class currently supports readonly access to the properties
# serialized in "property set" streams, such as the file "\005SummaryInformation",
# in OLE files.
#
# Think it has its roots in MFC property set serialization.
#
# See http://poi.apache.org/hpsf/internals.html for details
#
class PropertySet
	# fixed-size stream header: signature, unknown, os id, guid, section count.
	HEADER_SIZE = 28
	HEADER_PACK = "vvVa#{Clsid::SIZE}V"
	# maps the serialized os id to a friendly symbol.
	OS_MAP = {
		0 => :win16,
		1 => :mac,
		2 => :win32,
		0x20001 => :ooffice, # open office on linux...
	}
	# define a smattering of the property set guids, loaded from the bundled
	# propids.yaml data file, keyed by parsed Clsid.
	DATA = YAML.load_file(File.dirname(__FILE__) + '/../../../data/propids.yaml').
		inject({}) { |hash, (key, value)| hash.update Clsid.parse(key) => value }
	# create an inverted map of names to guid/key pairs
	PROPERTY_MAP = DATA.inject({}) do |h1, (guid, data)|
		data[1].inject(h1) { |h2, (id, name)| h2.update name => [guid, id] }
	end
	# expose each property set name (eg FMTID_SummaryInformation) as a constant
	# holding its guid.
	module Constants
		DATA.each { |guid, (name, map)| const_set name, guid }
	end
	include Constants
	include Enumerable

	# A single section within the property set stream: a guid plus a table of
	# (property id, offset) pairs pointing at the serialized values.
	class Section
		include Variant::Constants
		include Enumerable

		# serialized size of a section list entry: guid + 4-byte offset.
		SIZE = Clsid::SIZE + 4
		PACK = "a#{Clsid::SIZE}v"

		attr_accessor :guid, :offset
		attr_reader :length

		# +str+ is one SIZE-byte section list entry; +property_set+ is the
		# owning PropertySet, whose io we read from.
		def initialize str, property_set
			@property_set = property_set
			@guid, @offset = str.unpack PACK
			self.guid = Clsid.load guid
			load_header
		end

		# the underlying stream is owned by the property set.
		def io
			@property_set.io
		end

		# read the section's own header: total byte size and property count.
		def load_header
			io.seek offset
			@byte_size, @length = io.read(8).unpack 'V2'
		end

		# look up a single property value by numeric id; nil when absent.
		def [] key
			each_raw do |id, property_offset|
				return read_property(property_offset).last if key == id
			end
			nil
		end

		# writes are not supported yet - the property set is readonly.
		def []= key, value
			raise NotImplementedError, 'section writes not yet implemented'
		end

		# yields [id, value] for each property in the section.
		def each
			each_raw do |id, property_offset|
				yield id, read_property(property_offset).last
			end
		end

	private

		# yields the raw (id, offset) pairs from the section's property table.
		# NOTE: each_chunk is a String extension added by ole/support.
		def each_raw
			io.seek offset + 8
			io.read(length * 8).each_chunk(8) { |str| yield(*str.unpack('V2')) }
		end

		# reads the [type, value] stored at +property_offset+ within the
		# section. only string types get decoded; other types are returned as
		# the raw 4-byte value.
		def read_property property_offset
			io.seek offset + property_offset
			type, value = io.read(8).unpack('V2')
			# is the method of serialization here custom?
			case type
			when VT_LPSTR, VT_LPWSTR
				value = Variant.load type, io.read(value)
			# ....
			end
			[type, value]
		end
	end

	attr_reader :io, :signature, :unknown, :os, :guid, :sections

	# +io+ is a seekable stream positioned at the start of a serialized
	# property set (eg the "\005SummaryInformation" stream of an OLE file).
	def initialize io
		@io = io
		load_header io.read(HEADER_SIZE)
		load_section_list io.read(@num_sections * Section::SIZE)
		# expect no gap between last section and start of data.
		#Log.warn "gap between section list and property data" unless io.pos == @sections.map(&:offset).min
	end

	# unpack the stream header fields.
	def load_header str
		@signature, @unknown, @os_id, @guid, @num_sections = str.unpack HEADER_PACK
		# should i check that unknown == 0? it usually is. so is the guid actually
		@guid = Clsid.load @guid
		# FIX: the old code did `@os = OS_MAP[@os_id] || Log.warn(...)`, which
		# assigned Log.warn's return value (true) to @os for unknown ids.
		# leave @os nil when the id is unrecognised.
		@os = OS_MAP[@os_id]
		Log.warn "unknown operating system id #{@os_id}" unless @os
	end

	# parse the section list that follows the header.
	def load_section_list str
		@sections = str.to_enum(:each_chunk, Section::SIZE).map { |s| Section.new s, self }
	end

	# look up a property by symbolic name (eg :doc_author); nil when the name
	# is unknown or its section is not present in this stream.
	def [] key
		pair = PROPERTY_MAP[key.to_s] or return nil
		section = @sections.find { |s| s.guid == pair.first } or return nil
		section[pair.last]
	end

	# delegate writes to the owning section (currently raises
	# NotImplementedError there); silently returns nil for unknown names.
	def []= key, value
		pair = PROPERTY_MAP[key.to_s] or return nil
		section = @sections.find { |s| s.guid == pair.first } or return nil
		section[pair.last] = value
	end

	# provide getter/setter style access for every known property name,
	# eg +ps.doc_author+ or +ps.doc_author = 'x'+.
	def method_missing name, *args, &block
		if name.to_s =~ /(.*)=$/
			return super unless args.length == 1
			return super unless PROPERTY_MAP[$1]
			self[$1] = args.first
		else
			return super unless args.length == 0
			return super unless PROPERTY_MAP[name.to_s]
			self[name]
		end
	end

	# advertise the dynamic accessors provided by method_missing, so that
	# respond_to? answers truthfully for them.
	def respond_to_missing? name, include_all = false
		!!PROPERTY_MAP[name.to_s.chomp('=')] || super
	end

	# yields [name, value] for every property in every known section.
	def each
		@sections.each do |section|
			next unless pair = DATA[section.guid]
			map = pair.last
			section.each do |id, value|
				name = map[id] or next
				yield name, value
			end
		end
	end

	# all known properties as a symbol-keyed hash.
	def to_h
		inject({}) { |hash, (name, value)| hash.update name.to_sym => value }
	end
end
end
end