Big update from the ruby-ole trunk
git-svn-id: file:///home/svn/framework3/trunk@6522 4d416f70-5f16-0410-b530-b9f4589650da
This commit is contained in:
parent
4bafe57fe3
commit
43a03aa307
|
@ -1,5 +1,7 @@
|
|||
|
||||
module Ole # :nodoc:
|
||||
require 'ole/support'
|
||||
Log = Logger.new_with_callstack
|
||||
end
|
||||
|
||||
require 'ole/support'
|
||||
|
||||
module Ole # :nodoc:
|
||||
Log = Logger.new_with_callstack
|
||||
end
|
||||
|
||||
|
|
|
@ -1,169 +1,2 @@
|
|||
=begin

full file_system module

will be available and recommended usage, allowing Ole::Storage, Dir, and Zip::ZipFile to be
used pretty interchangeably down the track. it should be possible to write a recursive copy using
the plain api, such that you can copy dirs/files agnostically between any of ole docs, dirs,
and zip files.

i think it's okay to have an api like this on top, but there are certain things that ole
does that aren't captured by it.
Ole::Storage can have multiple files with the same name, for example, or with / in the
name, and other things that are probably invalid anyway.
i think this should remain an addon, built on top of my core api.
but still the ideas can be reflected in the core, ie, changing the read/write semantics.

once the core changes are complete, this will be a pretty straightforward file to complete.

=end
|
||||
|
||||
module Ole
|
||||
class Storage
|
||||
# Lazily-constructed FileParent proxy, providing a File-like api over this
# storage (see FileParent).
def file
  @file ||= FileParent.new(self)
end
|
||||
|
||||
# Lazily-constructed DirParent proxy, providing a Dir-like api over this
# storage (see DirParent).
def dir
  @dir ||= DirParent.new(self)
end
|
||||
|
||||
# Resolve +path_str+ (eg "/some/dir/file") to its Dirent, starting from the
# root. Leading/trailing/repeated slashes are ignored; an empty path resolves
# to the root itself. Raises (RuntimeError) when a component is missing, or
# when a file component is used as a directory.
def dirent_from_path path_str
  components = path_str.sub(/^\/*/, '').sub(/\/*$/, '').split(/\/+/)
  components.inject(@root) do |dirent, name|
    raise "invalid path #{path_str.inspect}" if dirent.file?
    # allow write etc later.
    dirent[name] || raise("invalid path #{path_str.inspect}")
  end
end
|
||||
|
||||
# Proxies a small subset of ::File's class-level interface onto an
# Ole::Storage instance (see Storage#file).
class FileParent
  def initialize ole
    @ole = ole
  end

  # Open the file at +path_str+. With a block, yields the io (returning the
  # block's value); otherwise returns the io itself.
  # NOTE: +mode+ is currently accepted but unused.
  def open path_str, mode='r'
    dirent = @ole.dirent_from_path path_str
    # like Errno::EISDIR
    raise "#{path_str.inspect} is a directory" unless dirent.file?
    io = dirent.io
    block_given? ? yield(io) : io
  end

  alias new :open

  # Slurp the entire contents of the file at +path+.
  def read path
    open(path) { |f| f.read }
  end

  # crappy copy from Dir.
  def unlink path
    dirent = @ole.dirent_from_path path
    # EPERM
    raise "operation not permitted #{path.inspect}" unless dirent.file?
    # we should free all of our blocks, eg via open(path) { |f| f.truncate 0 },
    # which would release them from the allocation table. then, removed from
    # our parent below, we won't be part of the bat at save time. repack
    # should zero all free blocks.
    parent = @ole.dirent_from_path(('/' + path).sub(/\/[^\/]+$/, ''))
    parent.children.delete dirent
    1 # hmmm. as per ::File ?
  end
end
|
||||
|
||||
# Proxies a small subset of ::Dir's class-level interface onto an
# Ole::Storage instance (see Storage#dir).
class DirParent
  def initialize ole
    @ole = ole
  end

  # Open the directory at +path_str+. With a block, yields the Dir handle
  # and returns the block's value; otherwise returns the handle itself.
  def open path_str
    dirent = @ole.dirent_from_path path_str
    # like Errno::ENOTDIR
    raise "#{path_str.inspect} is not a directory" unless dirent.dir?
    handle = Dir.new dirent, path_str
    block_given? ? yield(handle) : handle
  end

  # certain Dir class methods proxy in this fashion:
  def entries path
    open(path) { |dir| dir.entries }
  end

  # there are some other important ones, like:
  # chroot (!), mkdir, chdir, rmdir, glob etc etc. for now, i think
  # mkdir, and rmdir are the main ones we'd need to support
  def rmdir path
    dirent = @ole.dirent_from_path path
    # repeating myself
    raise "#{path.inspect} is not a directory" unless dirent.dir?
    # ENOTEMPTY:
    raise "directory not empty #{path.inspect}" unless dirent.children.empty?
    # the canonical representation that is maintained is the root tree and the
    # children arrays, so deletion means removing ourself from our parent's
    # children. (the original had a dead `|| '/'` here - String#sub never
    # returns nil, and dirent_from_path('') already resolves to the root.)
    # note that this only takes effect at save/repack time; to make a
    # difference right now it would have to re-write the dirent. i think that
    # Ole::Storage#close will handle that. and maybe include a #repack.
    parent = @ole.dirent_from_path path.sub(/\/[^\/]+$/, '')
    parent.children.delete dirent
    0 # hmmm. as per ::Dir ?
  end

  # A read-only handle on an open directory, mirroring a little of ::Dir's
  # instance-level interface. +entries+ is fixed at open time.
  class Dir
    include Enumerable
    attr_reader :dirent, :path, :entries, :pos

    def initialize dirent, path
      @dirent = dirent
      @path = path
      @pos = 0
      # FIXME: hack, and probably not really desired
      @entries = %w[. ..] + @dirent.children.map { |child| child.name }
    end

    def each(&block)
      @entries.each(&block)
    end

    def close
    end

    # return the entry at the current position, then advance (sticking at
    # the end of the list). nil once exhausted.
    def read
      @entries[@pos]
    ensure
      @pos += 1 if @pos < @entries.length
    end

    # seek, clamped to the valid range [0, entries.length].
    def pos= pos
      @pos = pos.clamp 0, @entries.length
    end

    def rewind
      @pos = 0
    end

    alias tell :pos
    alias seek :pos=
  end
end
|
||||
end
|
||||
end
|
||||
# keeping this file around for now, but will delete later on...
|
||||
require 'ole/storage/file_system'
|
||||
|
|
|
@ -0,0 +1,231 @@
|
|||
# need IO::Mode
|
||||
require 'ole/support'
|
||||
|
||||
#
|
||||
# = Introduction
|
||||
#
|
||||
# +RangesIO+ is a basic class for wrapping another IO object allowing you to arbitrarily reorder
|
||||
# slices of the input file by providing a list of ranges. Intended as an initial measure to curb
|
||||
# inefficiencies in the Dirent#data method just reading all of a file's data in one hit, with
|
||||
# no method to stream it.
|
||||
#
|
||||
# This class will encapuslate the ranges (corresponding to big or small blocks) of any ole file
|
||||
# and thus allow reading/writing directly to the source bytes, in a streamed fashion (so just
|
||||
# getting 16 bytes doesn't read the whole thing).
|
||||
#
|
||||
# In the simplest case it can be used with a single range to provide a limited io to a section of
|
||||
# a file.
|
||||
#
|
||||
# = Limitations
|
||||
#
|
||||
# * No buffering. by design at the moment. Intended for large reads
|
||||
#
|
||||
# = TODO
|
||||
#
|
||||
# On further reflection, this class is something of a joining/optimization of
|
||||
# two separate IO classes. a SubfileIO, for providing access to a range within
|
||||
# a File as a separate IO object, and a ConcatIO, allowing the presentation of
|
||||
# a bunch of io objects as a single unified whole.
|
||||
#
|
||||
# I will need such a ConcatIO if I'm to provide Mime#to_io, a method that will
|
||||
# convert a whole mime message into an IO stream, that can be read from.
|
||||
# It will just be the concatenation of a series of IO objects, corresponding to
|
||||
# headers and boundaries, as StringIO's, and SubfileIO objects, coming from the
|
||||
# original message proper, or RangesIO as provided by the Attachment#data, that
|
||||
# will then get wrapped by Mime in a Base64IO or similar, to get encoded on-the-
|
||||
# fly. Thus the attachment, in its plain or encoded form, and the message as a
|
||||
# whole never exists as a single string in memory, as it does now. This is a
|
||||
# fair bit of work to achieve, but generally useful I believe.
|
||||
#
|
||||
# This class isn't ole specific, maybe move it to my general ruby stream project.
|
||||
#
|
||||
#
# +RangesIO+ wraps another IO object, presenting an ordered list of
# (offset, length) ranges of the parent as one contiguous, seekable stream.
# It encapsulates the big/small block chains of an ole file so data can be
# read/written in a streamed fashion; with a single range it simply provides
# a limited io over a section of a file. No buffering, by design - intended
# for large reads. The ranges may overlap.
#
class RangesIO
  attr_reader :io, :mode, :ranges, :size, :pos

  # +io+::     the parent io object that we are wrapping.
  # +mode+::   the mode string to use (parsed by IO::Mode).
  # +params+:: hash of params.
  #            * :ranges - byte offsets, either:
  #              1. an array of ranges [1..2, 4..5, 6..8] or
  #              2. an array of [offset, length] pairs [[1, 1], [4, 1], [6, 2]]
  #                 (think the way String indexing works)
  #            * :close_parent - boolean to close parent when this object is closed
  #
  # NOTE: the +ranges+ can overlap.
  def initialize io, mode='r', params={}
    # support RangesIO.new(io, :ranges => ...) with the default mode
    mode, params = 'r', mode if Hash === mode
    ranges = params[:ranges]
    @params = {:close_parent => false}.merge params
    @mode = IO::Mode.new mode
    @io = io
    # convert ranges to [offset, length] arrays. check for negative ranges?
    # default to a single range covering the whole parent io.
    # FIX: this was `[0, io.size]` - a single pair instead of a list of
    # pairs - which broke the destructuring in the size calculation and in
    # read/write whenever :ranges wasn't given.
    ranges ||= [[0, io.size]]
    @ranges = ranges.map { |r| Range === r ? [r.begin, r.end - r.begin] : r }
    # calculate size
    @size = @ranges.inject(0) { |total, (pos, len)| total + len }
    # initial position in the file
    @pos = 0

    # handle some mode flags
    truncate 0 if @mode.truncate?
    seek size if @mode.append?
  end

  # block form - yields the io and closes it afterwards, otherwise returns
  # it as for +new+. TODO add test for this
  def self.open(*args, &block)
    ranges_io = new(*args)
    if block_given?
      begin; yield ranges_io
      ensure; ranges_io.close
      end
    else
      ranges_io
    end
  end

  # seek to +pos+, with +whence+ semantics as for IO#seek. raises
  # Errno::EINVAL for an unknown +whence+ or an out-of-bounds position.
  # FIX: the bound check was `(0...@size)`, which rejected seeking to EOF -
  # including the `seek size` that initialize itself performs for append
  # mode, and any seek at all on an empty io. pos == size is now legal.
  def pos= pos, whence=IO::SEEK_SET
    case whence
    when IO::SEEK_SET
    when IO::SEEK_CUR
      pos += @pos
    when IO::SEEK_END
      pos = @size + pos
    else raise Errno::EINVAL
    end
    raise Errno::EINVAL unless (0..@size) === pos
    @pos = pos
  end

  alias seek :pos=
  alias tell :pos

  def close
    @io.close if @params[:close_parent]
  end

  # returns the [+offset+, +size+] pair in order to read/write at +pos+
  # (like a partial range), and its index.
  def offset_and_size pos
    total = 0
    ranges.each_with_index do |(offset, size), i|
      if pos <= total + size
        diff = pos - total
        return [offset + diff, size - diff], i
      end
      total += size
    end
    # should be impossible for any valid pos, (0...size) === pos
    raise ArgumentError, "no range for pos #{pos.inspect}"
  end

  def eof?
    @pos == @size
  end

  # read bytes from file, to a maximum of +limit+, or all available if unspecified.
  def read limit=nil
    data = ''
    return data if eof?
    limit ||= size
    partial_range, i = offset_and_size @pos
    # this may be conceptually nice (create sub-range starting where we are), but
    # for a large range array its pretty wasteful. even the previous way was. but
    # i'm not trying to optimize this atm. it may even go to c later if necessary.
    ([partial_range] + ranges[i+1..-1]).each do |pos, len|
      @io.seek pos
      if limit < len
        # convoluted, to handle read errors. s may be nil
        s = @io.read limit
        @pos += s.length if s
        break data << s
      end
      # convoluted, to handle ranges beyond the size of the file
      s = @io.read len
      @pos += s.length if s
      data << s
      break if s.length != len
      limit -= len
    end
    data
  end

  # you may override this call to update @ranges and @size, if applicable.
  def truncate size
    raise NotImplementedError, 'truncate not supported'
  end

  # using explicit forward instead of an alias now for overriding.
  # should override truncate.
  def size= size
    truncate size
  end

  # write +data+ at the current position, spilling across ranges as needed,
  # and growing (via the truncate hook) when there isn't room. raises
  # IOError when growth is needed but truncate is unsupported. returns the
  # number of bytes written.
  def write data
    # short cut. needed because truncate 0 may return no ranges, instead of empty range,
    # thus offset_and_size fails.
    return 0 if data.empty?
    data_pos = 0
    # if we don't have room, we can use the truncate hook to make more space.
    if data.length > @size - @pos
      begin
        truncate @pos + data.length
      rescue NotImplementedError
        raise IOError, "unable to grow #{inspect} to write #{data.length} bytes"
      end
    end
    partial_range, i = offset_and_size @pos
    ([partial_range] + ranges[i+1..-1]).each do |pos, len|
      @io.seek pos
      if data_pos + len > data.length
        chunk = data[data_pos..-1]
        @io.write chunk
        @pos += chunk.length
        data_pos = data.length
        break
      end
      @io.write data[data_pos, len]
      @pos += len
      data_pos += len
    end
    data_pos
  end

  alias << write

  # i can wrap it in a buffered io stream that provides gets, and
  # appropriately handles pos, truncate. mostly added just to pass the
  # tests. FIXME
  def gets
    s = read 1024
    i = s.index "\n"
    @pos -= s.length - (i+1)
    s[0..i]
  end
  alias readline :gets

  def inspect
    # the rescue is for empty files
    pos, len = (@ranges[offset_and_size(@pos).last] rescue [nil, nil])
    range_str = pos ? "#{pos}..#{pos+len}" : 'nil'
    "#<#{self.class} io=#{io.inspect}, size=#{@size}, pos=#{@pos}, "\
    "range=#{range_str}>"
  end
end
|
||||
|
||||
# this subclass of ranges io explicitly ignores the truncate part of 'w' modes.
|
||||
# only really needed for the allocation table writes etc. maybe just use explicit modes
|
||||
# for those
|
||||
# better yet write a test that breaks before I fix it. added nodoc for the
|
||||
# time being.
|
||||
# A RangesIO subclass that explicitly strips the truncate part of 'w' style
# modes. only really needed for the allocation table writes etc - maybe just
# use explicit modes for those. better yet write a test that breaks before
# fixing it properly. added nodoc for the time being.
class RangesIONonResizeable < RangesIO # :nodoc:
  def initialize io, mode='r', params={}
    mode, params = 'r', mode if Hash === mode
    # mask out the truncate bit before handing off to RangesIO
    flags = IO::Mode.new(mode).flags & ~IO::TRUNC
    super io, flags, params
  end
end
|
||||
|
|
@ -1,934 +1,3 @@
|
|||
#! /usr/bin/ruby -w
|
||||
|
||||
require 'iconv'
|
||||
require 'date'
|
||||
require 'stringio'
|
||||
require 'tempfile'
|
||||
|
||||
|
||||
require 'ole/base'
|
||||
require 'ole/types'
|
||||
require 'ole/io_helpers'
|
||||
|
||||
module Ole # :nodoc:
|
||||
|
||||
#
|
||||
# = Introduction
|
||||
#
|
||||
# <tt>Ole::Storage</tt> is a simple class intended to abstract away details of the
|
||||
# access to OLE2 structured storage files, such as those produced by
|
||||
# Microsoft Office, eg *.doc, *.msg etc.
|
||||
#
|
||||
# Initially based on chicago's libole, source available at
|
||||
# http://prdownloads.sf.net/chicago/ole.tgz
|
||||
# Later augmented with some corrections by inspecting pole, and (purely
|
||||
# for header definitions) gsf.
|
||||
#
|
||||
# = Usage
|
||||
#
|
||||
# Usage should be fairly straight forward:
|
||||
#
|
||||
# # get the parent ole storage object
|
||||
# ole = Ole::Storage.open 'myfile.msg', 'r+'
|
||||
# # => #<Ole::Storage io=#<File:myfile.msg> root=#<Dirent:"Root Entry">>
|
||||
# # read some data
|
||||
# ole.root[1].read 4
|
||||
# # => "\001\000\376\377"
|
||||
# # get the top level root object and output a tree structure for
|
||||
# # debugging
|
||||
# puts ole.root.to_tree
|
||||
# # =>
|
||||
# - #<Dirent:"Root Entry" size=3840 time="2006-11-03T00:52:53Z">
|
||||
# |- #<Dirent:"__nameid_version1.0" size=0 time="2006-11-03T00:52:53Z">
|
||||
# | |- #<Dirent:"__substg1.0_00020102" size=16 data="CCAGAAAAAADAAA...">
|
||||
# ...
|
||||
# |- #<Dirent:"__substg1.0_8002001E" size=4 data="MTEuMA==">
|
||||
# |- #<Dirent:"__properties_version1.0" size=800 data="AAAAAAAAAAABAA...">
|
||||
# \- #<Dirent:"__recip_version1.0_#00000000" size=0 time="2006-11-03T00:52:53Z">
|
||||
# |- #<Dirent:"__substg1.0_0FF60102" size=4 data="AAAAAA==">
|
||||
# ...
|
||||
# # write some data, and finish up (note that open is 'r+', so this overwrites
|
||||
# # but doesn't truncate)
|
||||
# ole.root["\001CompObj"].open { |f| f.write "blah blah" }
|
||||
# ole.close
|
||||
#
|
||||
# = TODO
|
||||
#
|
||||
# 1. tests. lock down how things work at the moment - mostly good.
|
||||
# create from scratch works now, as does copying in a subtree of another doc, so
|
||||
# ole embedded attachment serialization works now. i can save embedded xls in an msg
|
||||
# into a separate file, and open it. this was a goal. now i would want to implemenet
|
||||
# to_mime conversion for embedded attachments, that serializes them to ole, but handles
|
||||
# some separately like various meta file types as plain .wmf attachments perhaps. this
|
||||
# will give pretty good .eml's from emails with embedded attachments.
|
||||
# the other todo is .rtf output, with full support for embedded ole objects...
|
||||
# 2. lots of tidying up
|
||||
# - main FIXME's in this regard are:
|
||||
# * the custom header cruft for Header and Dirent needs some love.
|
||||
# * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
|
||||
# and, in a manner of speaking, but arguably different, Storage itself.
|
||||
# they have differing api's which would be nice to clean.
|
||||
# AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
|
||||
# * ole types need work, can't serialize datetime at the moment.
|
||||
# 3. need to fix META_BAT support in #flush.
|
||||
#
|
||||
class Storage
|
||||
VERSION = '1.1.1'
|
||||
|
||||
# The top of the ole tree structure
|
||||
attr_reader :root
|
||||
# The tree structure in its original flattened form. only valid after #load, or #flush.
|
||||
attr_reader :dirents
|
||||
# The underlying io object to/from which the ole object is serialized, whether we
|
||||
# should close it, and whether it is writeable
|
||||
attr_reader :io, :close_parent, :writeable
|
||||
# Low level internals, you probably shouldn't need to mess with these
|
||||
attr_reader :header, :bbat, :sbat, :sb_file
|
||||
|
||||
# maybe include an option hash, and allow :close_parent => true, to be more general.
|
||||
# +arg+ should be either a file, or an +IO+ object, and needs to be seekable.
|
||||
def initialize arg, mode=nil
|
||||
# get the io object
|
||||
@close_parent, @io = if String === arg
|
||||
[true, open(arg, mode || 'rb')]
|
||||
else
|
||||
raise 'unable to specify mode string with io object' if mode
|
||||
[false, arg]
|
||||
end
|
||||
# do we have this file opened for writing? don't know of a better way to tell
|
||||
@writeable = begin
|
||||
@io.flush
|
||||
true
|
||||
rescue IOError
|
||||
false
|
||||
end
|
||||
# silence undefined warning in clear
|
||||
@sb_file = nil
|
||||
# if the io object has data, we should load it, otherwise start afresh
|
||||
if @io.size > 0; load
|
||||
else clear
|
||||
end
|
||||
end
|
||||
|
||||
# Same as +allocate+/+initialize+, but adds a File.open style block form:
# yields the storage (returning the block's value) and guarantees it is
# closed afterwards.
def self.new arg, mode=nil
  ole = super
  return ole unless block_given?
  begin
    yield ole
  ensure
    ole.close
  end
end
|
||||
|
||||
class << self
  # encouraged
  alias_method :open, :new
  # deprecated
  alias_method :load, :new
end
|
||||
|
||||
# load document from file.
|
||||
# load document from file: parse the header, bootstrap the big block
# allocation table from the in-header + mbat chain, then read and re-tree
# the flattened directory entries, and finally set up the small-block
# machinery (@sb_file / @sbat).
def load
  # we always read 512 for the header block. if the block size ends up being different,
  # what happens to the 109 fat entries. are there more/less entries?
  @io.rewind
  header_block = @io.read 512
  @header = Header.load header_block

  # create an empty bbat
  @bbat = AllocationTable::Big.new self
  # extra mbat blocks - a linear run starting at mbat_start
  mbat_blocks = (0...@header.num_mbat).map { |i| i + @header.mbat_start }
  # bbat chain = in-header part (after the 76 byte header proper) + mbat blocks
  bbat_chain = (header_block[Header::SIZE..-1] + @bbat.read(mbat_blocks)).unpack 'L*'
  # am i using num_bat in the right way?
  @bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

  # get block chain for directories, read it, then split it into chunks and load the
  # directory entries. semantics changed - used to cut at first dir where dir.type == 0
  @dirents = @bbat.read(@header.dirent_start).scan(/.{#{Dirent::SIZE}}/mo).
    map { |str| Dirent.load self, str }.reject { |d| d.type_id == 0 }

  # now reorder from flat into a tree
  # links are stored in some kind of balanced binary tree
  # check that everything is visited at least, and at most once
  # similarly with the blocks of the file.
  # was thinking of moving this to Dirent.to_tree instead.
  class << @dirents
    # recursively build the subtree rooted at the dirent at +idx+, returning
    # it as a flat, in-order array. Dirent#idx doubles as the "visited"
    # marker, so a dirent linked from two places raises.
    def to_tree idx=0
      return [] if idx == Dirent::EOT
      d = self[idx]
      d.children = to_tree d.child
      raise "directory #{d.inspect} used twice" if d.idx
      d.idx = idx
      to_tree(d.prev) + [d] + to_tree(d.next)
    end
  end

  @root = @dirents.to_tree.first
  Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
  # dirents never reached from the root tree keep a nil idx
  unused = @dirents.reject(&:idx).length
  Log.warn "* #{unused} unused directories" if unused > 0

  # FIXME i don't currently use @header.num_sbat which i should
  # hmm. nor do i write it. it means what exactly again?
  # NOTE(review): three positional args here - RangesIOResizeable must take
  # (bat, first_block, size), not RangesIO's (io, mode, params); verify
  # against its definition.
  @sb_file = RangesIOResizeable.new @bbat, @root.first_block, @root.size
  @sbat = AllocationTable::Small.new self
  @sbat.load @bbat.read(@header.sbat_start)
end
|
||||
|
||||
# Close the storage: flush pending changes to the backing io when it was
# opened writeable, close the small-block composite file, and close the
# backing io itself only if we own it (ie it was opened from a filename).
def close
  flush if @writeable
  @sb_file.close
  @io.close if @close_parent
end
|
||||
|
||||
# should have a #open_dirent i think. and use it in load and flush. neater.
|
||||
# also was thinking about Dirent#open_padding. then i can more easily clean up the padding
|
||||
# to be 0.chr
|
||||
=begin
|
||||
thoughts on fixes:
|
||||
1. reterminate any chain not ending in EOC.
|
||||
2. pass through all chain heads looking for collisions, and making sure nothing points to them
|
||||
(ie they are really heads).
|
||||
3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
|
||||
in the bat for them.
|
||||
this stuff will ensure reliability of input better. otherwise, its actually worth doing a repack
|
||||
directly after read, to ensure the above is probably acounted for, before subsequent writes possibly
|
||||
destroy things.
|
||||
=end
|
||||
# Serialize the in-memory state back to the underlying io: directory
# entries, sbat, then the bbat itself (claiming blocks for it via the
# resizeable-io truncate hook), and finally the header + in-header mbat.
def flush
  # recreate dirs from our tree, split into dirs and big and small files
  @root.type = :root
  # for now.
  @root.name = 'Root Entry'
  @root.first_block = @sb_file.first_block
  @root.size = @sb_file.size
  @dirents = @root.flatten
  #dirs, files = @dirents.partition(&:dir?)
  #big_files, small_files = files.partition { |file| file.size > @header.threshold }

  # maybe i should move the block form up to RangesIO, and get it for free at all levels.
  # Dirent#open gets block form for free then
  io = RangesIOResizeable.new @bbat, @header.dirent_start
  io.truncate 0
  @dirents.each { |dirent| io.write dirent.save }
  # pad the dirent data out to a whole number of big blocks
  padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
  #p [:padding, padding]
  io.write 0.chr * padding
  @header.dirent_start = io.first_block
  io.close

  # similarly for the sbat data.
  io = RangesIOResizeable.new @bbat, @header.sbat_start
  io.truncate 0
  io.write @sbat.save
  @header.sbat_start = io.first_block
  @header.num_sbat = @bbat.chain(@header.sbat_start).length
  io.close

  # what follows will be slightly more complex for the bat fiddling.

  # create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
  # truncate. then when its time to write, convert that chain and some chunk of blocks at
  # the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
  # done.

  # release the previous bat/meta-bat marker blocks before reallocating
  @bbat.table.map! do |b|
    b == AllocationTable::BAT || b == AllocationTable::META_BAT ?
      AllocationTable::AVAIL : b
  end
  io = RangesIOResizeable.new @bbat, AllocationTable::EOC
  # use crappy loop for now: growing the io can itself grow the bbat, so
  # iterate until the claimed space covers the serialized table.
  while true
    bbat_data = @bbat.save
    #mbat_data = bbat_data.length / @bbat.block_size * 4
    mbat_chain = @bbat.chain io.first_block
    raise NotImplementedError, "don't handle writing out extra META_BAT blocks yet" if mbat_chain.length > 109
    # so we can ignore meta blocks in this calculation:
    break if io.size >= bbat_data.length # it shouldn't be bigger right?
    # this may grow the bbat, depending on existing available blocks
    io.truncate bbat_data.length
  end

  # now extract the info we want:
  ranges = io.ranges
  mbat_chain = @bbat.chain io.first_block
  io.close
  # mark the claimed blocks as BAT blocks rather than ordinary chain links
  mbat_chain.each { |b| @bbat.table[b] = AllocationTable::BAT }
  @header.num_bat = mbat_chain.length
  #p @bbat.truncated_table
  #p ranges
  #p mbat_chain
  # not resizeable!
  # NOTE(review): this passes +ranges+ positionally where RangesIO#initialize
  # now expects (io, mode, params) - looks like a holdover from the old
  # (io, ranges) signature; verify against rangesio.
  io = RangesIO.new @io, ranges
  io.write @bbat.save
  io.close
  # pad the in-header mbat out to its full 109 entries
  mbat_chain += [AllocationTable::AVAIL] * (109 - mbat_chain.length)
  @header.mbat_start = AllocationTable::EOC
  @header.num_mbat = 0

=begin
  bbat_data = new_bbat.save
  # must exist as linear chain stored in header.
  @header.num_bat = (bbat_data.length / new_bbat.block_size.to_f).ceil
  base = io.pos / new_bbat.block_size - 1
  io.write bbat_data
  # now that spanned a number of blocks:
  mbat = (0...@header.num_bat).map { |i| i + base }
  mbat += [AllocationTable::AVAIL] * (109 - mbat.length) if mbat.length < 109
  header_mbat = mbat[0...109]
  other_mbat_data = mbat[109..-1].pack 'L*'
  @header.mbat_start = base + @header.num_bat
  @header.num_mbat = (other_mbat_data.length / new_bbat.block_size.to_f).ceil
  io.write other_mbat_data
=end

  @root.type = :dir

  # now seek back and write the header out
  @io.seek 0
  @io.write @header.save + mbat_chain.pack('L*')
  @io.flush
end
|
||||
|
||||
# Reset this storage to an empty ole document: fresh header, empty big and
# small allocation tables, a bare root dirent, and a truncated backing io.
def clear
  # first step though is to support modifying pre-existing and saving, then this
  # missing gap will be fairly straight forward - essentially initialize to
  # equivalent of loading an empty ole document.
  #raise NotImplementedError, 'unable to create new ole objects from scratch as yet'
  Log.warn 'creating new ole storage object on non-writable io' unless @writeable
  @header = Header.new
  @bbat = AllocationTable::Big.new self
  @root = Dirent.new self, :dir
  @root.name = 'Root Entry'
  @dirents = [@root]
  @root.idx = 0
  @root.children = []
  # size shouldn't display for non-files
  @root.size = 0
  # replace any previous small-block file with an empty one
  @sb_file.close if @sb_file
  @sb_file = RangesIOResizeable.new @bbat, AllocationTable::EOC
  @sbat = AllocationTable::Small.new self
  # throw everything else the hell away
  @io.truncate 0
end
|
||||
|
||||
# could be useful with mis-behaving ole documents. or to just clean them up.
|
||||
# Rewrite the whole document through a temporary backing store, cleaning it
# up. could be useful with mis-behaving ole documents. +temp+ selects the
# backing: :file for a Tempfile, :mem for a StringIO.
def repack temp=:file
  case temp
  when :file then Tempfile.open 'w+', &method(:repack_using_io)
  when :mem  then StringIO.open(&method(:repack_using_io))
  else raise "unknown temp backing #{temp.inspect}"
  end
end
|
||||
|
||||
# Copy the current document into +temp_io+, reset this storage to an empty
# document, then deep-copy the dirent tree back over from the temp copy.
# Used by #repack.
def repack_using_io temp_io
  @io.rewind
  # NOTE(review): IO.copy is an extension from ole/support, not core ruby.
  IO.copy @io, temp_io
  clear
  Storage.open temp_io do |temp_ole|
    # presumably demoted to a plain :dir so Dirent.copy treats it like any
    # other directory - TODO confirm
    temp_ole.root.type = :dir
    Dirent.copy temp_ole.root, root
  end
end
|
||||
|
||||
# Pick the allocation table appropriate for a file of +size+ bytes: big
# blocks at or above the header threshold, small blocks below it.
def bat_for_size size
  # note >=, not > previously.
  if size >= @header.threshold
    @bbat
  else
    @sbat
  end
end
||||
|
||||
def inspect
|
||||
"#<#{self.class} io=#{@io.inspect} root=#{@root.inspect}>"
|
||||
end
|
||||
|
||||
# A class which wraps the ole header
|
||||
# A class which wraps the ole header, with one Struct member per field of
# the OLE2 compound document header layout.
class Header < Struct.new(
    :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
    :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
    :sbat_start, :num_sbat, :mbat_start, :num_mbat
  )
  # binary layout of the 76 (0x4c) byte header proper
  PACK = 'a8 a16 S2 a2 S2 a6 L3 a4 L5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  MAGIC = "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # expected value of Header#magic
  # what you get if creating new header from scratch.
  # AllocationTable::EOC isn't available yet. meh.
  EOC = 0xfffffffe
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, "\xfe\xff", 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  # 2 basic initializations, from scratch, or from a data string.
  # from scratch will be geared towards creating a new ole object
  def initialize *values
    values = DEFAULT if values.empty?
    super(*values)
    validate!
  end

  # Parse a header from the packed bytes of +str+.
  def self.load str
    new(*str.unpack(PACK))
  end

  # Serialize back to the binary header layout.
  def save
    to_a.pack PACK
  end

  # Sanity-check the header. Hard inconsistencies (bad magic, impossible
  # bat/mbat counts, bad shifts, big endian) raise; soft oddities only warn.
  def validate!
    raise "OLE2 signature is invalid" unless magic == MAGIC
    hard_broken =
      num_bat == 0 || # is that valid for a completely empty file?
      # not sure about this one. basically to do max possible bat given size of mbat
      num_bat > 109 && num_bat > 109 + num_mbat * (1 << b_shift - 2) ||
      # shouldn't need to use the mbat as there is enough space in the header block
      num_bat < 109 && num_mbat != 0 ||
      # given the size of the header is 76, if b_shift <= 6, blocks address the header.
      s_shift > b_shift || b_shift <= 6 || b_shift >= 31 ||
      # we only handle little endian
      byte_order != "\xfe\xff"
    raise "not valid OLE2 structured storage file" if hard_broken
    # relaxed this, due to test-msg/qwerty_[1-3]*.msg they all had
    # 3 for this value.
    # transacting_signature != "\x00" * 4 or
    suspect =
      threshold != 4096 ||
      num_mbat == 0 && mbat_start != AllocationTable::EOC ||
      reserved != "\x00" * 6
    Log.warn "may not be a valid OLE2 structured storage file" if suspect
    true
  end
end
|
||||
|
||||
#
# +AllocationTable+'s hold the chains corresponding to files. Given
# an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
# the blocks that make up that file.
#
# There are 2 allocation tables, the bbat, and sbat, for big and small
# blocks respectively. The block chain should be loaded using either
# <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
# as appropriate.
#
# Whether or not big or small blocks are used for a file depends on
# whether its size is over the <tt>Header#threshold</tt> level.
#
# An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
# which are stored in blocks throughout the file. The blocks are either
# big or small, and are accessed using the <tt>AllocationTable</tt>.
#
# The bbat allocation table's data is stored in the spare room in the header
# block, and in extra blocks throughout the file as referenced by the meta
# bat. That chain is linear, as there is no higher level table.
#
class AllocationTable
	# a free block (I don't currently leave any blocks free), although I do pad out
	# the allocation table with AVAIL to the block size.
	AVAIL = 0xffffffff
	EOC = 0xfffffffe # end of a chain
	# these blocks correspond to the bat, and aren't part of a file, nor available.
	# (I don't currently output these)
	BAT = 0xfffffffd
	META_BAT = 0xfffffffc

	attr_reader :ole, :io, :table, :block_size

	# +ole+ is the owning Ole::Storage object. @io and @block_size are filled in
	# by the Big / Small subclasses.
	def initialize ole
		@ole = ole
		@table = []
	end

	# replace the table with the 32 bit little endian entries serialized in +data+.
	def load data
		@table = data.unpack('L*')
	end

	# the table with trailing AVAIL entries stripped.
	# this strips trailing AVAILs. come to think of it, this has the potential to break
	# bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
	# very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
	# at load time.
	def truncated_table
		temp = @table.reverse
		not_avail = temp.find { |b| b != AVAIL } and temp = temp[temp.index(not_avail)..-1]
		temp.reverse
	end

	# serialize the table, padded with AVAIL out to a multiple of the bbat block size.
	def save
		table = truncated_table #@table
		# pad it out some
		num = @ole.bbat.block_size / 4
		# do you really use AVAIL? they probably extend past end of file, and may shortly
		# be used for the bat. not really good.
		table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
		table.pack 'L*'
	end

	# follow the chain starting at block +start+, returning the array of block
	# indexes that make up the corresponding file.
	# rewriting this to be non-recursive. it broke on a large attachment
	# building up the chain, causing a stack error. need tail-call elimination...
	def chain start
		a = []
		idx = start
		until idx >= META_BAT
			# FIX: was `idx > @table.length`, which let idx == @table.length through,
			# after which @table[idx] returned nil and the loop crashed comparing
			# nil >= META_BAT. use >= so corrupt chains raise cleanly instead.
			raise "broken allocationtable chain" if idx < 0 || idx >= @table.length
			a << idx
			idx = @table[idx]
		end
		Log.warn "invalid chain terminator #{idx}" unless idx == EOC
		a
	end

	# convenience: accept either a chain head or an already-expanded chain array.
	def ranges chain, size=nil
		chain = self.chain(chain) unless Array === chain
		blocks_to_ranges chain, size
	end

	# Turn a chain (an array given by +chain+) of big blocks, optionally
	# truncated to +size+, into an array of arrays describing the stretches of
	# bytes in the file that it belongs to.
	#
	# Big blocks are of size Ole::Storage::Header#b_size, and are stored
	# directly in the parent file.
	def blocks_to_ranges chain, size=nil
		# truncate the chain if required
		chain = chain[0...(size.to_f / block_size).ceil] if size
		# convert chain to ranges of the block size
		ranges = chain.map { |i| [block_size * i, block_size] }
		# truncate final range if required
		ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
		ranges
	end

	# quick shortcut. chain can be either a head (in which case the table is used to
	# turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
	# its not resizeable or migrateable. it probably could be resizeable though, using
	# self as the bat. but what would the first_block be?
	def open chain, size=nil
		io = RangesIO.new @io, ranges(chain, size)
		if block_given?
			begin yield io
			ensure; io.close
			end
		else io
		end
	end

	# read the entire contents of the (possibly truncated) chain.
	def read chain, size=nil
		open chain, size, &:read
	end

	# ----------------------

	# return the index of the first AVAIL entry, growing the table if necessary.
	def get_free_block
		@table.each_index { |i| return i if @table[i] == AVAIL }
		@table.push AVAIL
		@table.length - 1
	end

	# grow or shrink the chain starting at +first_block+ so it covers +size+ bytes.
	# must return first_block
	def resize_chain first_block, size
		new_num_blocks = (size / block_size.to_f).ceil
		blocks = chain first_block
		old_num_blocks = blocks.length
		if new_num_blocks < old_num_blocks
			# de-allocate some of our old blocks. TODO maybe zero them out in the file???
			(new_num_blocks...old_num_blocks).each { |i| @table[blocks[i]] = AVAIL }
			# if we have a chain, terminate it and return head, otherwise return EOC
			if new_num_blocks > 0
				@table[blocks[new_num_blocks-1]] = EOC
				first_block
			else EOC
			end
		elsif new_num_blocks > old_num_blocks
			# need some more blocks.
			last_block = blocks.last
			(new_num_blocks - old_num_blocks).times do
				block = get_free_block
				# connect the chain. handle corner case of blocks being [] initially
				if last_block
					@table[last_block] = block
				else
					first_block = block
				end
				last_block = block
				# this is just to inhibit the problem where it gets picked as being a free block
				# again next time around.
				@table[last_block] = EOC
			end
			first_block
		else first_block
		end
	end

	# allocation table for big blocks, backed directly by the ole file's io.
	class Big < AllocationTable
		def initialize(*args)
			super
			@block_size = 1 << @ole.header.b_shift
			@io = @ole.io
		end

		# Big blocks are kind of -1 based, in order to not clash with the header.
		def blocks_to_ranges blocks, size
			super blocks.map { |b| b + 1 }, size
		end
	end

	# allocation table for small blocks, backed by the ole's small-block file.
	class Small < AllocationTable
		def initialize(*args)
			super
			@block_size = 1 << @ole.header.s_shift
			@io = @ole.sb_file
		end
	end
end
|
||||
|
||||
# like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
# AllocationTable, and can be resized. used for read/write to 2 streams:
# 1. serialized dirent data
# 2. sbat table data
# 3. all dirents but through RangesIOMigrateable below
#
# Note that all internal access to first_block is through accessors, as it is sometimes
# useful to redirect it.
class RangesIOResizeable < RangesIO
	attr_reader :bat
	attr_accessor :first_block

	# +bat+ is the AllocationTable backing this stream; +first_block+ is the
	# chain head (AllocationTable::EOC for an empty stream).
	def initialize bat, first_block, size=nil
		@bat = bat
		self.first_block = first_block
		super @bat.io, @bat.ranges(first_block, size)
	end

	# resize the underlying chain to +size+ bytes, updating our ranges and
	# growing the backing io when needed.
	def truncate size
		# note that old_blocks is != @ranges.length necessarily. i'm planning to write a
		# merge_ranges function that merges sequential ranges into one as an optimization.
		self.first_block = @bat.resize_chain first_block, size
		@ranges = @bat.ranges first_block, size
		# FIX: was `@pos = @size`, clamping to the OLD size — after a shrink that
		# left @pos beyond the new end of stream. clamp to the new size instead.
		@pos = size if @pos > size

		# don't know if this is required, but we explicitly request our @io to grow if necessary
		# we never shrink it though. maybe this belongs in allocationtable, where smarter decisions
		# can be made.
		# maybe its ok to just seek out there later??
		max = @ranges.map { |pos, len| pos + len }.max || 0
		@io.truncate max if max > @io.size

		@size = size
	end
end
|
||||
|
||||
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for migration
# between bats based on size, and updating the dirent, instead of the ole copy back
# on close.
class RangesIOMigrateable < RangesIOResizeable
	attr_reader :dirent

	# wrap +dirent+, choosing the big or small bat according to its current size.
	def initialize dirent
		@dirent = dirent
		super @dirent.ole.bat_for_size(@dirent.size), @dirent.first_block, @dirent.size
	end

	# resize to +size+ bytes, migrating the data between the big and small bats
	# when the new size crosses the threshold.
	def truncate size
		target_bat = @dirent.ole.bat_for_size size
		if target_bat != @bat
			# bat migration needed! we need to backup some data. the amount of data
			# should be <= @ole.header.threshold, so we can just hold it all in one buffer.
			# backup this
			saved_pos = @pos
			@pos = 0
			backup = read [@size, size].min
			# this does a normal truncate to 0, removing our presence from the old bat, and
			# rewrite the dirent's first_block
			super 0
			@bat = target_bat
			# just change the underlying io from right under everyone :)
			@io = target_bat.io
			# important to do this now, before the write. as the below write will always
			# migrate us back to sbat! this will now allocate us +size+ in the new bat.
			super
			@pos = 0
			write backup
			@pos = saved_pos
		else
			super
		end
		# now just update the file
		@dirent.size = size
	end

	# forward this to the dirent
	def first_block
		@dirent.first_block
	end

	def first_block= val
		@dirent.first_block = val
	end
end
|
||||
|
||||
#
# A class which wraps an ole directory entry. Can be either a directory
# (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
#
# Most interaction with <tt>Ole::Storage</tt> is through this class.
# The 2 most important functions are <tt>Dirent#children</tt>, and
# <tt>Dirent#data</tt>.
#
# was considering separate classes for dirs and files. some methods/attrs only
# applicable to one or the other.
class Dirent
	# field names of the 128 byte on-disk record, in PACK order.
	MEMBERS = [
		:name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
		:clsid, :flags, # dirs only
		:create_time_str, :modify_time_str, # files only
		:first_block, :size, :reserved
	]
	PACK = 'a64 S C C L3 a16 L a8 a8 L2 a4'
	SIZE = 128
	# origin for ole timestamps
	EPOCH = DateTime.parse '1601-01-01'
	TYPE_MAP = {
		# this is temporary
		0 => :empty,
		1 => :dir,
		2 => :file,
		5 => :root
	}
	COLOUR_MAP = {
		0 => :red,
		1 => :black
	}
	# used in the next / prev / child stuff to show that the tree ends here.
	# also used for first_block for directory.
	EOT = 0xffffffff
	# All +Dirent+ names are in UTF16, which we convert
	FROM_UTF16 = Iconv.new 'utf-8', 'utf-16le'
	TO_UTF16 = Iconv.new 'utf-16le', 'utf-8'

	include Enumerable

	# the raw unpacked field values, kept in MEMBERS order.
	attr_accessor :values

	# Dirent's should be created in 1 of 2 ways, either Dirent.new ole, [:dir/:file/:root],
	# or Dirent.load '... dirent data ...'
	# its a bit clunky, but thats how it is at the moment. you can assign to type, but
	# shouldn't.

	# index of this dirent in the flattened dirent table.
	attr_accessor :idx
	# This returns all the children of this +Dirent+. It is filled in
	# when the tree structure is recreated.
	attr_accessor :children
	attr_reader :ole, :type, :create_time, :modify_time, :name

	def initialize ole, type
		@ole = ole
		# this isn't really good enough. need default values put in there.
		# NOTE: create_time_str / modify_time_str start as nil; #save fills them
		# in with zeroed strings if still unset.
		@values = [
			0.chr * 2, 2, 0, # will get overwritten
			1, EOT, EOT, EOT,
			0.chr * 16, 0, nil, nil,
			AllocationTable::EOC, 0, 0.chr * 4]
		# maybe check types here.
		@type = type
		@create_time = @modify_time = nil
		@children = []
		if file?
			@create_time = Time.now
			@modify_time = Time.now
		end
	end

	# construct a Dirent from its serialized form, bypassing #initialize.
	def self.load ole, str
		# load should function without the need for the initializer.
		dirent = Dirent.allocate
		dirent.load ole, str
		dirent
	end

	# populate this dirent's fields from the 128 byte string +str+.
	def load ole, str
		@ole = ole
		@values = str.unpack PACK
		@name = FROM_UTF16.iconv name_utf16[0...name_len].sub(/\x00\x00$/, '')
		@type = TYPE_MAP[type_id] or raise "unknown type #{type_id.inspect}"
		if file?
			@create_time = Types.load_time create_time_str
			@modify_time = Types.load_time modify_time_str
		end
	end

	# only defined for files really. and the above children stuff is only for children.
	# maybe i should have some sort of File and Dir class, that subclass Dirents? a dirent
	# is just a data holder.
	# this can be used for write support if the underlying io object was opened for writing.
	# maybe take a mode string argument, and do truncation, append etc stuff.
	def open
		return nil unless file?
		io = RangesIOMigrateable.new self
		if block_given?
			begin yield io
			ensure; io.close
			end
		else io
		end
	end

	# read up to +limit+ bytes of this file dirent's data (nil for dirs).
	def read limit=nil
		open { |io| io.read limit }
	end

	def dir?
		# to count root as a dir.
		type != :file
	end

	def file?
		type == :file
	end

	def time
		# time is nil for streams, otherwise try to parse either of the time pairse (not
		# sure of their meaning - created / modified?)
		#@time ||= file? ? nil : (Dirent.parse_time(secs1, days1) || Dirent.parse_time(secs2, days2))
		create_time || modify_time
	end

	def each(&block)
		@children.each(&block)
	end

	# child lookup - by position for Integer +idx+, otherwise by name match
	# (+idx+ may be a string or a regexp, via ===).
	def [] idx
		return children[idx] if Integer === idx
		# path style look up.
		# maybe take another arg to allow creation? or leave that to the filesystem
		# add on.
		# not sure if '/' is a valid char in an Dirent#name, so no splitting etc at
		# this level.
		# also what about warning about multiple hits for the same name?
		children.find { |child| idx === child.name }
	end

	# solution for the above '/' thing for now.
	def / path
		self[path]
	end

	# ascii-art rendering of the tree rooted at this dirent.
	def to_tree
		if children and !children.empty?
			str = "- #{inspect}\n"
			children.each_with_index do |child, i|
				last = i == children.length - 1
				child.to_tree.split(/\n/).each_with_index do |line, j|
					str << "  #{last ? (j == 0 ? "\\" : ' ') : '|'}#{line}\n"
				end
			end
			str
		else "- #{inspect}\n"
		end
	end

	# define reader/writer pairs for each raw field, backed by @values.
	MEMBERS.each_with_index do |sym, i|
		define_method(sym) { @values[i] }
		define_method(sym.to_s + '=') { |val| @values[i] = val }
	end

	def to_a
		@values
	end

	# flattens the tree starting from here into +dirents+. note it modifies its argument.
	def flatten dirents=[]
		@idx = dirents.length
		dirents << self
		children.each { |child| child.flatten dirents }
		self.child = Dirent.flatten_helper children
		dirents
	end

	# i think making the tree structure optimized is actually more complex than this, and
	# requires some intelligent ordering of the children based on names, but as long as
	# it is valid its ok.
	# actually, i think its ok. gsf for example only outputs a singly-linked-list, where
	# prev is always EOT.
	def self.flatten_helper children
		return EOT if children.empty?
		i = children.length / 2
		this = children[i]
		this.prev, this.next = [(0...i), (i+1..-1)].map { |r| flatten_helper children[r] }
		this.idx
	end

	attr_accessor :name, :type

	# serialize this dirent back into its 128 byte on-disk representation.
	def save
		tmp = TO_UTF16.iconv(name)
		tmp = tmp[0, 62] if tmp.length > 62
		tmp += 0.chr * 2
		self.name_len = tmp.length
		self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
		begin
			self.type_id = TYPE_MAP.to_a.find { |id, name| @type == name }.first
		rescue
			raise "unknown type #{type.inspect}"
		end
		# for the case of files, it is assumed that that was handled already
		# note not dir?, so as not to override root's first_block
		self.first_block = Dirent::EOT if type == :dir
		# FIX: the original used `if 0 #file?` with an empty branch — but 0 is
		# truthy in ruby, so the else branch never ran and a freshly created
		# dirent packed nil time strings (TypeError from Array#pack). default
		# nils to zeroed strings while preserving time strings of loaded dirents.
		# TODO: serialize real timestamps from @create_time / @modify_time for files.
		self.create_time_str ||= 0.chr * 8
		self.modify_time_str ||= 0.chr * 8
		@values.pack PACK
	end

	def inspect
		str = "#<Dirent:#{name.inspect}"
		# perhaps i should remove the data snippet. its not that useful anymore.
		if file?
			tmp = read 9
			data = tmp.length == 9 ? tmp[0, 5] + '...' : tmp
			str << " size=#{size}" +
				"#{time ? ' time=' + time.to_s.inspect : nil}" +
				" data=#{data.inspect}"
		else
			# there is some dir specific stuff. like clsid, flags.
		end
		str + '>'
	end

	# --------
	# and for creation of a dirent. don't like the name. is it a file or a directory?
	# assign to type later? io will be empty.
	def new_child type
		child = Dirent.new ole, type
		children << child
		yield child if block_given?
		child
	end

	# remove +child+ from this dirent and release its allocated blocks.
	def delete child
		# remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
		raise "#{child.inspect} not a child of #{self.inspect}" unless @children.delete child
		# free our blocks
		child.open { |io| io.truncate 0 }
	end

	def self.copy src, dst
		# copies the contents of src to dst. must be the same type. this will throw an
		# error on copying to root. maybe this will recurse too much for big documents??
		raise unless src.type == dst.type
		dst.name = src.name
		if src.dir?
			src.children.each do |src_child|
				dst.new_child(src_child.type) { |dst_child| Dirent.copy src_child, dst_child }
			end
		else
			src.open do |src_io|
				dst.open { |dst_io| IO.copy src_io, dst_io }
			end
		end
	end
end
|
||||
end
|
||||
end
|
||||
|
||||
# quick command line usage: dump the directory tree of the given ole file.
if __FILE__ == $0
	puts Ole::Storage.open(ARGV[0]) { |ole| ole.root.to_tree }
end
|
||||
require 'ole/storage/base'
|
||||
require 'ole/storage/file_system'
|
||||
require 'ole/storage/meta_data'
|
||||
|
|
|
@ -0,0 +1,916 @@
|
|||
require 'tempfile'
|
||||
|
||||
require 'ole/base'
|
||||
require 'ole/types'
|
||||
require 'ole/ranges_io'
|
||||
|
||||
module Ole # :nodoc:
|
||||
#
|
||||
# This class is the primary way the user interacts with an OLE storage file.
|
||||
#
|
||||
# = TODO
|
||||
#
|
||||
# * the custom header cruft for Header and Dirent needs some love.
|
||||
# * i have a number of classes doing load/save combos: Header, AllocationTable, Dirent,
|
||||
# and, in a manner of speaking, but arguably different, Storage itself.
|
||||
# they have differing api's which would be nice to rethink.
|
||||
# AllocationTable::Big must be created aot now, as it is used for all subsequent reads.
|
||||
#
|
||||
class Storage
|
||||
# thrown for any bogus OLE file errors.
class FormatError < StandardError # :nodoc:
end

VERSION = '1.2.8.2'

# options used at creation time
attr_reader :params

# The top of the ole tree structure
attr_reader :root

# The tree structure in its original flattened form. only valid after #load, or #flush.
attr_reader :dirents

# The underlying io object to/from which the ole object is serialized, whether we
# should close it, and whether it is writeable
attr_reader :io, :close_parent, :writeable

# Low level internals, you probably shouldn't need to mess with these
attr_reader :header, :bbat, :sbat, :sb_file
|
||||
|
||||
# +arg+ should be either a filename, or an +IO+ object, and needs to be seekable.
# +mode+ is optional, and should be a regular mode string.
def initialize arg, mode=nil, params={}
	# support the (arg, params) call form without a mode string
	params, mode = mode, nil if Hash === mode
	@params = {:update_timestamps => true}.merge(params)

	# get the io object - either open the named file, or take the io as given
	if String === arg
		mode ||= 'rb'
		@close_parent = true
		@io = open(arg, mode)
	else
		raise ArgumentError, 'unable to specify mode string with io object' if mode
		@close_parent = false
		@io = arg
	end

	# do we have this file opened for writing? don't know of a better way to tell
	# (unless we parse the mode string in the open case)
	# hmmm, note that in ruby 1.9 this doesn't work anymore. which is all the more
	# reason to use mode string parsing when available, and fall back to something like
	# io.writeable? otherwise.
	@writeable =
		begin
			if mode
				IO::Mode.new(mode).writeable?
			else
				@io.flush
				# this is for the benefit of ruby-1.9
				@io.syswrite('') if @io.respond_to?(:syswrite)
				true
			end
		rescue IOError
			false
		end

	# silence undefined warning in clear
	@sb_file = nil
	# if the io object has data, we should load it, otherwise start afresh
	# this should be based on the mode string rather.
	@io.size > 0 ? load : clear
end
|
||||
|
||||
# somewhat similar to File.open, the open class method allows a block form where
# the Ole::Storage object is automatically closed on completion of the block.
def self.open arg, mode=nil, params={}
	ole = new arg, mode, params
	return ole unless block_given?
	begin
		yield ole
	ensure
		ole.close
	end
end
|
||||
|
||||
# load document from file.
#
# TODO: implement various allocationtable checks, maybe as a AllocationTable#fsck function :)
#
# 1. reterminate any chain not ending in EOC.
#    compare file size with actually allocated blocks per file.
# 2. pass through all chain heads looking for collisions, and making sure nothing points to them
#    (ie they are really heads). in both sbat and mbat
# 3. we know the locations of the bbat data, and mbat data. ensure that there are placeholder blocks
#    in the bat for them.
# 4. maybe a check of excess data. if there is data outside the bbat.truncate.length + 1 * block_size,
#    (eg what is used for truncate in #flush), then maybe add some sort of message about that. it
#    will be automatically thrown away at close time.
def load
	# we always read 512 for the header block. if the block size ends up being different,
	# what happens to the 109 fat entries. are there more/less entries?
	@io.rewind
	head = @io.read 512
	@header = Header.new head

	# create an empty bbat, then gather its chain: the leading entries live in the
	# spare room of the header block, the rest are linked through the meta bat.
	@bbat = AllocationTable::Big.new self
	bbat_chain = head[Header::SIZE..-1].unpack 'V*'
	mbat_block = @header.mbat_start
	@header.num_mbat.times do
		blocks = @bbat.read([mbat_block]).unpack 'V*'
		# the trailing entry of each meta bat block points at the next one
		mbat_block = blocks.pop
		bbat_chain += blocks
	end
	# am i using num_bat in the right way?
	@bbat.load @bbat.read(bbat_chain[0, @header.num_bat])

	# get block chain for directories, read it, then split it into chunks and load the
	# directory entries. semantics changed - used to cut at first dir where dir.type == 0
	@dirents = @bbat.read(@header.dirent_start).to_enum(:each_chunk, Dirent::SIZE).
		map { |str| Dirent.new self, str }.reject { |d| d.type_id == 0 }

	# now reorder from flat into a tree
	# links are stored in some kind of balanced binary tree
	# check that everything is visited at least, and at most once
	# similarly with the blocks of the file.
	# was thinking of moving this to Dirent.to_tree instead.
	class << @dirents
		def to_tree idx=0
			return [] if idx == Dirent::EOT
			d = self[idx]
			d.children = to_tree d.child
			raise FormatError, "directory #{d.inspect} used twice" if d.idx
			d.idx = idx
			to_tree(d.prev) + [d] + to_tree(d.next)
		end
	end

	@root = @dirents.to_tree.first
	Log.warn "root name was #{@root.name.inspect}" unless @root.name == 'Root Entry'
	unused = @dirents.reject(&:idx).length
	Log.warn "#{unused} unused directories" if unused > 0

	# FIXME i don't currently use @header.num_sbat which i should
	# hmm. nor do i write it. it means what exactly again?
	# which mode to use here?
	@sb_file = RangesIOResizeable.new @bbat, :first_block => @root.first_block, :size => @root.size
	@sbat = AllocationTable::Small.new self
	@sbat.load @bbat.read(@header.sbat_start)
end
|
||||
|
||||
# close the document: release the small block file, flush meta data if we were
# writeable, and close the io if we opened it ourselves.
def close
	@sb_file.close
	flush if @writeable
	@io.close if @close_parent
end
|
||||
|
||||
# the flush method is the main "save" method. all file contents are always
# written directly to the file by the RangesIO objects, all this method does
# is write out all the file meta data - dirents, allocation tables, file header
# etc.
#
# maybe add an option to zero the padding, and any remaining avail blocks in the
# allocation table.
#
# TODO: long and overly complex. simplify and test better. eg, perhaps move serialization
# of bbat to AllocationTable::Big.
def flush
	# update root dirent, and flatten dirent tree
	@root.name = 'Root Entry'
	@root.first_block = @sb_file.first_block
	@root.size = @sb_file.size
	@dirents = @root.flatten

	# serialize the dirents using the bbat
	RangesIOResizeable.open @bbat, 'w', :first_block => @header.dirent_start do |io|
		@dirents.each { |dirent| io.write dirent.to_s }
		padding = (io.size / @bbat.block_size.to_f).ceil * @bbat.block_size - io.size
		io.write 0.chr * padding
		@header.dirent_start = io.first_block
	end

	# serialize the sbat
	# perhaps the blocks used by the sbat should be marked with BAT?
	RangesIOResizeable.open @bbat, 'w', :first_block => @header.sbat_start do |io|
		io.write @sbat.to_s
		@header.sbat_start = io.first_block
		@header.num_sbat = @bbat.chain(@header.sbat_start).length
	end

	# create RangesIOResizeable hooked up to the bbat. use that to claim bbat blocks using
	# truncate. then when its time to write, convert that chain and some chunk of blocks at
	# the end, into META_BAT blocks. write out the chain, and those meta bat blocks, and its
	# done.
	# this is perhaps not good, as we reclaim all bat blocks here, which
	# may include the sbat we just wrote. FIXME
	@bbat.map! do |b|
		b == AllocationTable::BAT || b == AllocationTable::META_BAT ? AllocationTable::AVAIL : b
	end

	# currently we use a loop. this could be better, but basically,
	# the act of writing out the bat, itself requires blocks which get
	# recorded in the bat.
	#
	# i'm sure that there'd be some simpler closed form solution to this. solve
	# recursive func:
	#
	#   num_mbat_blocks = ceil(max((mbat_len - 109) * 4 / block_size, 0))
	#   bbat_len = initial_bbat_len + num_mbat_blocks
	#   mbat_len = ceil(bbat_len * 4 / block_size)
	#
	# the actual bbat allocation table is itself stored throughout the file, and that chain
	# is stored in the initial blocks, and the mbat blocks.
	num_mbat_blocks = 0
	io = RangesIOResizeable.new @bbat, 'w', :first_block => AllocationTable::EOC
	# truncate now, so that we can simplify size calcs - the mbat blocks will be appended in a
	# contiguous chunk at the end.
	# hmmm, i think this truncate should be matched with a truncate of the underlying io. if you
	# delete a lot of stuff, and free up trailing blocks, the file size never shrinks. this can
	# be fixed easily, add an io truncate
	@bbat.truncate!
	# FIX: removed the unused local `before = @io.size` that the original captured
	# and never read.
	@io.truncate @bbat.block_size * (@bbat.length + 1)
	loop do
		# get total bbat size. equivalent to @bbat.to_s.length, but for the factoring in of
		# the mbat blocks. we can't just add the mbat blocks directly to the bbat, as as this iteration
		# progresses, more blocks may be needed for the bat itself (if there are no more gaps), and the
		# mbat must remain contiguous.
		bbat_data_len = ((@bbat.length + num_mbat_blocks) * 4 / @bbat.block_size.to_f).ceil * @bbat.block_size
		# now storing the excess mbat blocks also increases the size of the bbat:
		new_num_mbat_blocks = ([bbat_data_len / @bbat.block_size - 109, 0].max * 4 / (@bbat.block_size.to_f - 4)).ceil
		if new_num_mbat_blocks != num_mbat_blocks
			# need more space for the mbat.
			num_mbat_blocks = new_num_mbat_blocks
		elsif io.size != bbat_data_len
			# need more space for the bat
			# this may grow the bbat, depending on existing available blocks
			io.truncate bbat_data_len
		else
			break
		end
	end

	# now extract the info we want:
	ranges = io.ranges
	bbat_chain = @bbat.chain io.first_block
	io.close
	bbat_chain.each { |b| @bbat[b] = AllocationTable::BAT }
	# tack on the mbat stuff
	@header.num_bat = bbat_chain.length
	mbat_blocks = (0...num_mbat_blocks).map do
		block = @bbat.free_block
		@bbat[block] = AllocationTable::META_BAT
		block
	end
	@header.mbat_start = mbat_blocks.first || AllocationTable::EOC

	# now finally write the bbat, using a not resizable io.
	# the mode here will be 'r', which allows write atm.
	RangesIO.open(@io, :ranges => ranges) { |f| f.write @bbat.to_s }

	# this is the mbat. pad it out.
	bbat_chain += [AllocationTable::AVAIL] * [109 - bbat_chain.length, 0].max
	@header.num_mbat = num_mbat_blocks
	if num_mbat_blocks != 0
		# write out the mbat blocks now. first of all, where are they going to be?
		mbat_data = bbat_chain[109..-1]
		# expand the mbat_data to include the linked list forward pointers.
		mbat_data = mbat_data.to_enum(:each_slice, @bbat.block_size / 4 - 1).to_a.
			zip(mbat_blocks[1..-1] + [nil]).map { |a, b| b ? a + [b] : a }
		# pad out the last one.
		mbat_data.last.push(*([AllocationTable::AVAIL] * (@bbat.block_size / 4 - mbat_data.last.length)))
		RangesIO.open @io, :ranges => @bbat.ranges(mbat_blocks) do |f|
			f.write mbat_data.flatten.pack('V*')
		end
	end

	# now seek back and write the header out
	@io.seek 0
	@io.write @header.to_s + bbat_chain[0, 109].pack('V*')
	@io.flush
end
|
||||
|
||||
# initialize to equivalent of loading an empty ole document.
def clear
	Log.warn 'creating new ole storage object on non-writable io' unless @writeable
	# fresh header, allocation tables, and a bare root dirent
	@header = Header.new
	@bbat = AllocationTable::Big.new self
	@root = Dirent.new self, :type => :root, :name => 'Root Entry'
	@dirents = [@root]
	@root.idx = 0
	# replace any previous small-block file with an empty one
	@sb_file.close if @sb_file
	@sb_file = RangesIOResizeable.new @bbat, :first_block => AllocationTable::EOC
	@sbat = AllocationTable::Small.new self
	# throw everything else the hell away
	@io.truncate 0
end
|
||||
|
||||
# could be useful with mis-behaving ole documents. or to just clean them up.
# +temp+ selects the backing for the temporary copy: :file or :mem.
def repack temp=:file
	case temp
	when :file
		Tempfile.open 'ole-repack' do |io|
			io.binmode
			repack_using_io io
		end
	when :mem
		StringIO.open('', &method(:repack_using_io))
	else
		raise ArgumentError, "unknown temp backing #{temp.inspect}"
	end
end
|
||||
|
||||
# copy the whole document into +temp_io+, wipe ourselves clean, then copy the
# dirent tree back over from the temporary ole - compacting the layout.
def repack_using_io temp_io
	@io.rewind
	IO.copy @io, temp_io
	clear
	Storage.open temp_io, nil, @params do |temp_ole|
		#temp_ole.root.type = :dir
		Dirent.copy temp_ole.root, root
	end
end
|
||||
|
||||
# pick the allocation table appropriate for a stream of +size+ bytes:
# small blocks below the header threshold, big blocks at or above it.
def bat_for_size size
	# note >=, not > previously.
	size < @header.threshold ? @sbat : @bbat
end
|
||||
|
||||
def inspect
|
||||
"#<#{self.class} io=#{@io.inspect} root=#{@root.inspect}>"
|
||||
end
|
||||
|
||||
#
# Wrapper around the ole header block.
#
# Header.new accepts either a raw binary string to parse, or an array of
# field values (defaulting to DEFAULT). Serialization is done by #to_s.
#
class Header < Struct.new(
    :magic, :clsid, :minor_ver, :major_ver, :byte_order, :b_shift, :s_shift,
    :reserved, :csectdir, :num_bat, :dirent_start, :transacting_signature, :threshold,
    :sbat_start, :num_sbat, :mbat_start, :num_mbat
  )
  PACK = 'a8 a16 v2 a2 v2 a6 V3 a4 V5'
  SIZE = 0x4c
  # i have seen it pointed out that the first 4 bytes of hex,
  # 0xd0cf11e0, is supposed to spell out docfile. hmmm :)
  MAGIC = "\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1" # expected value of Header#magic
  # local copy of the end-of-chain marker - AllocationTable::EOC isn't
  # available yet at this point. meh.
  EOC = 0xfffffffe
  # what you get if creating a new header from scratch.
  DEFAULT = [
    MAGIC, 0.chr * 16, 59, 3, "\xfe\xff", 9, 6,
    0.chr * 6, 0, 1, EOC, 0.chr * 4,
    4096, EOC, 0, EOC, 0
  ]

  def initialize values=DEFAULT
    values = values.unpack(PACK) if String === values
    super(*values)
    validate!
  end

  def to_s
    to_a.pack PACK
  end

  # Sanity-check the parsed fields. Structural problems raise FormatError;
  # merely suspicious values only produce a warning. Returns true.
  def validate!
    raise FormatError, "OLE2 signature is invalid" unless magic == MAGIC
    structurally_broken =
      num_bat == 0 || # is that valid for a completely empty file?
      # not sure about this one. basically the max possible bat given the mbat size
      (num_bat > 109 && num_bat > 109 + num_mbat * (1 << b_shift - 2)) ||
      # shouldn't need the mbat while there is enough space in the header block
      (num_bat < 109 && num_mbat != 0) ||
      # given the header is 76 bytes, if b_shift <= 6, blocks address the header.
      s_shift > b_shift || b_shift <= 6 || b_shift >= 31 ||
      # we only handle little endian
      byte_order != "\xfe\xff"
    raise FormatError, "not valid OLE2 structured storage file" if structurally_broken
    # transacting_signature is deliberately left unchecked - relaxed because
    # test-msg/qwerty_[1-3]*.msg all had 3 for that value.
    suspicious =
      threshold != 4096 ||
      (num_mbat == 0 && mbat_start != AllocationTable::EOC) ||
      reserved != "\x00" * 6
    Log.warn "may not be a valid OLE2 structured storage file" if suspicious
    true
  end
end
|
||||
|
||||
#
# +AllocationTable+'s hold the chains corresponding to files. Given
# an initial index, <tt>AllocationTable#chain</tt> follows the chain, returning
# the blocks that make up that file.
#
# There are 2 allocation tables, the bbat, and sbat, for big and small
# blocks respectively. The block chain should be loaded using either
# <tt>Storage#read_big_blocks</tt> or <tt>Storage#read_small_blocks</tt>
# as appropriate.
#
# Whether or not big or small blocks are used for a file depends on
# whether its size is over the <tt>Header#threshold</tt> level.
#
# An <tt>Ole::Storage</tt> document is serialized as a series of directory objects,
# which are stored in blocks throughout the file. The blocks are either
# big or small, and are accessed using the <tt>AllocationTable</tt>.
#
# The bbat allocation table's data is stored in the spare room in the header
# block, and in extra blocks throughout the file as referenced by the meta
# bat. That chain is linear, as there is no higher level table.
#
# AllocationTable.new is used to create an empty table. It can parse a string
# with the #load method. Serialization is accomplished with the #to_s method.
#
class AllocationTable < Array
  # a free block (I don't currently leave any blocks free), although I do pad out
  # the allocation table with AVAIL to the block size.
  AVAIL = 0xffffffff
  EOC = 0xfffffffe # end of a chain
  # these blocks are used for storing the allocation table chains
  BAT = 0xfffffffd
  META_BAT = 0xfffffffc

  attr_reader :ole, :io, :block_size
  def initialize ole
    @ole = ole
    # whether the table may contain AVAIL entries before its end - lets
    # free_block skip the O(n) index scan when known to be false.
    @sparse = true
    super()
  end

  # Replace the table contents with the little-endian uint32s in +data+.
  def load data
    replace data.unpack('V*')
  end

  # Return a copy of the table with trailing AVAILs stripped.
  def truncate
    # this strips trailing AVAILs. come to think of it, this has the potential to break
    # bogus ole. if you terminate using AVAIL instead of EOC, like I did before. but that is
    # very broken. however, if a chain ends with AVAIL, it should probably be fixed to EOC
    # at load time.
    temp = reverse
    not_avail = temp.find { |b| b != AVAIL } and temp = temp[temp.index(not_avail)..-1]
    temp.reverse
  end

  def truncate!
    replace truncate
  end

  # Serialize, padded with AVAIL out to a multiple of the bbat block size.
  def to_s
    table = truncate
    # pad it out some
    num = @ole.bbat.block_size / 4
    # do you really use AVAIL? they probably extend past end of file, and may shortly
    # be used for the bat. not really good.
    table += [AVAIL] * (num - (table.length % num)) if (table.length % num) != 0
    table.pack 'V*'
  end

  # Follow the chain starting at +idx+, returning the array of block indices.
  # Non-recursive - the recursive version broke on a large attachment chain
  # with a stack error.
  def chain idx
    a = []
    until idx >= META_BAT
      # note: must be idx >= length - valid entries are 0...length, and
      # self[length] is nil, which previously crashed the next iteration
      # with NoMethodError instead of raising FormatError.
      raise FormatError, "broken allocationtable chain" if idx < 0 || idx >= length
      a << idx
      idx = self[idx]
    end
    Log.warn "invalid chain terminator #{idx}" unless idx == EOC
    a
  end

  # Turn a chain (an array given by +chain+) of blocks (optionally
  # truncated to +size+) into an array of arrays describing the stretches of
  # bytes in the file that it belongs to.
  #
  # The blocks are Big or Small blocks depending on the table type.
  def blocks_to_ranges chain, size=nil
    # truncate the chain if required
    chain = chain[0...(size.to_f / block_size).ceil] if size
    # convert chain to ranges of the block size
    ranges = chain.map { |i| [block_size * i, block_size] }
    # truncate final range if required
    ranges.last[1] -= (ranges.length * block_size - size) if ranges.last and size
    ranges
  end

  def ranges chain, size=nil
    chain = self.chain(chain) unless Array === chain
    blocks_to_ranges chain, size
  end

  # quick shortcut. chain can be either a head (in which case the table is used to
  # turn it into a chain), or a chain. it is converted to ranges, then to rangesio.
  def open chain, size=nil, &block
    RangesIO.open @io, :ranges => ranges(chain, size), &block
  end

  def read chain, size=nil
    open chain, size, &:read
  end

  # catch any method that may add an AVAIL somewhere in the middle, thus invalidating
  # the @sparse speedup for free_block. annoying using eval, but define_method won't
  # work for this.
  # FIXME
  [:map!, :collect!].each do |name|
    eval <<-END
      def #{name}(*args, &block)
        @sparse = true
        super
      end
    END
  end

  def []= idx, val
    @sparse = true if val == AVAIL
    super
  end

  # Return the index of a free (AVAIL) slot, appending one if necessary.
  def free_block
    if @sparse
      i = index(AVAIL) and return i
    end
    @sparse = false
    push AVAIL
    length - 1
  end

  # Grow or shrink the chain in +blocks+ to hold +size+ bytes. Modifies
  # +blocks+ in place and returns it (the caller derives first_block from it).
  def resize_chain blocks, size
    new_num_blocks = (size / block_size.to_f).ceil
    old_num_blocks = blocks.length
    if new_num_blocks < old_num_blocks
      # de-allocate some of our old blocks. TODO maybe zero them out in the file???
      (new_num_blocks...old_num_blocks).each { |i| self[blocks[i]] = AVAIL }
      self[blocks[new_num_blocks-1]] = EOC if new_num_blocks > 0
      blocks.slice! new_num_blocks..-1
    elsif new_num_blocks > old_num_blocks
      # need some more blocks.
      last_block = blocks.last
      (new_num_blocks - old_num_blocks).times do
        block = free_block
        # connect the chain. handle corner case of blocks being [] initially
        self[last_block] = block if last_block
        blocks << block
        last_block = block
        self[last_block] = EOC
      end
    end
    # update ranges, and return that also now
    blocks
  end

  class Big < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.b_shift
      @io = @ole.io
    end

    # Big blocks are kind of -1 based, in order to not clash with the header.
    def blocks_to_ranges blocks, size
      super blocks.map { |b| b + 1 }, size
    end
  end

  class Small < AllocationTable
    def initialize(*args)
      super
      @block_size = 1 << @ole.header.s_shift
      @io = @ole.sb_file
    end
  end
end
|
||||
|
||||
# like normal RangesIO, but Ole::Storage specific. the ranges are backed by an
# AllocationTable, and can be resized. used for read/write to 3 streams:
# 1. serialized dirent data
# 2. sbat table data
# 3. all dirents, but through RangesIOMigrateable below
#
# Note that all internal access to first_block is through accessors, as it is
# sometimes useful to redirect it.
class RangesIOResizeable < RangesIO
  attr_reader :bat
  attr_accessor :first_block

  # +bat+ is the AllocationTable backing this stream. +params+ must contain
  # :first_block; :size is optional. +mode+ may be omitted, with the params
  # hash taking its place.
  def initialize bat, mode='r', params={}
    mode, params = 'r', mode if Hash === mode
    head, len = params.values_at :first_block, :size
    raise ArgumentError, 'must specify first_block' unless head
    @bat = bat
    self.first_block = head
    # we now cache the blocks chain, for faster resizing.
    @blocks = @bat.chain head
    super @bat.io, mode, :ranges => @bat.ranges(@blocks, len)
  end

  def truncate size
    # note that old_blocks is != @ranges.length necessarily. a merge_ranges
    # function that folds sequential ranges into one is planned as an
    # optimization.
    @bat.resize_chain @blocks, size
    @ranges = @bat.ranges @blocks, size
    # NOTE(review): clamps @pos to the *old* @size rather than the new +size+,
    # which looks suspicious when shrinking - confirm intent before changing.
    @pos = @size if @pos > size
    self.first_block = @blocks.empty? ? AllocationTable::EOC : @blocks.first

    # don't know if this is required, but we explicitly request our @io to grow
    # if necessary. we never shrink it though. maybe this belongs in
    # allocationtable, where smarter decisions can be made.
    # maybe its ok to just seek out there later??
    needed = @ranges.map { |offset, len| offset + len }.max || 0
    @io.truncate needed if needed > @io.size

    @size = size
  end
end
|
||||
|
||||
# like RangesIOResizeable, but Ole::Storage::Dirent specific. provides for
# migration between bats based on size, and updating the dirent.
class RangesIOMigrateable < RangesIOResizeable
  attr_reader :dirent

  def initialize dirent, mode='r'
    @dirent = dirent
    super @dirent.ole.bat_for_size(@dirent.size), mode,
      :first_block => @dirent.first_block, :size => @dirent.size
  end

  # Resize to +size+, migrating between the small and big bats when the new
  # size crosses the header threshold. Statement order here is delicate.
  def truncate size
    target_bat = @dirent.ole.bat_for_size size
    if target_bat.class != @bat.class
      # bat migration needed! we need to back up some data. the amount of data
      # should be <= @ole.header.threshold, so we can just hold it all in one
      # buffer.
      saved_pos = @pos
      @pos = 0
      payload = read [@size, size].min
      # this does a normal truncate to 0, removing our presence from the old
      # bat, and rewriting the dirent's first_block
      super 0
      @bat = target_bat
      # just change the underlying io from right under everyone :)
      @io = target_bat.io
      # important to do this now, before the write. as the below write would
      # otherwise migrate us straight back to the sbat! this allocates us
      # +size+ in the new bat.
      super
      @pos = 0
      write payload
      @pos = saved_pos
    else
      super
    end
    # now just update the dirent's recorded size
    @dirent.size = size
  end

  # first_block is delegated to the dirent.
  def first_block
    @dirent.first_block
  end

  def first_block= val
    @dirent.first_block = val
  end
end
|
||||
|
||||
#
# A class which wraps an ole directory entry. Can be either a directory
# (<tt>Dirent#dir?</tt>) or a file (<tt>Dirent#file?</tt>)
#
# Most interaction with <tt>Ole::Storage</tt> is through this class.
# The 2 most important functions are <tt>Dirent#children</tt>, and
# <tt>Dirent#data</tt>.
#
# was considering separate classes for dirs and files. some methods/attrs only
# applicable to one or the other.
#
# As with the other classes, #to_s performs the serialization.
#
class Dirent < Struct.new(
    :name_utf16, :name_len, :type_id, :colour, :prev, :next, :child,
    :clsid, :flags, # dirs only
    :create_time_str, :modify_time_str, # files only
    :first_block, :size, :reserved
  )
  include RecursivelyEnumerable

  PACK = 'a64 v C C V3 a16 V a8 a8 V2 a4'
  SIZE = 128
  TYPE_MAP = {
    # this is temporary
    0 => :empty,
    1 => :dir,
    2 => :file,
    5 => :root
  }
  # something to do with the fact that the tree is supposed to be red-black
  COLOUR_MAP = {
    0 => :red,
    1 => :black
  }
  # used in the next / prev / child stuff to show that the tree ends here.
  # also used for first_block for directory.
  EOT = 0xffffffff
  DEFAULT = [
    0.chr * 2, 2, 0, # will get overwritten
    1, EOT, EOT, EOT,
    0.chr * 16, 0, nil, nil,
    AllocationTable::EOC, 0, 0.chr * 4
  ]

  # i think its just used by the tree building
  attr_accessor :idx
  # This returns all the children of this +Dirent+. It is filled in
  # when the tree structure is recreated.
  attr_accessor :children
  attr_accessor :name
  attr_reader :ole, :type, :create_time, :modify_time
  def initialize ole, values=DEFAULT, params={}
    @ole = ole
    values, params = DEFAULT, values if Hash === values
    values = values.unpack(PACK) if String === values
    super(*values)

    # extra parsing from the actual struct values
    @name = params[:name] || Types::Variant.load(Types::VT_LPWSTR, name_utf16[0...name_len])
    @type = if params[:type]
      unless TYPE_MAP.values.include?(params[:type])
        raise ArgumentError, "unknown type #{params[:type].inspect}"
      end
      params[:type]
    else
      TYPE_MAP[type_id] or raise FormatError, "unknown type_id #{type_id.inspect}"
    end

    # further extra type specific stuff
    if file?
      default_time = @ole.params[:update_timestamps] ? Time.now : nil
      @create_time ||= default_time
      @modify_time ||= default_time
      @create_time = Types::Variant.load(Types::VT_FILETIME, create_time_str) if create_time_str
      # fixed: previously loaded @modify_time from create_time_str, so the
      # parsed modification timestamp was always a copy of the creation time.
      @modify_time = Types::Variant.load(Types::VT_FILETIME, modify_time_str) if modify_time_str
      @children = nil
    else
      @create_time = nil
      @modify_time = nil
      self.size = 0 unless @type == :root
      @children = []
    end

    # to silence warnings. used for tree building at load time
    # only.
    @idx = nil
  end

  # Open this (file) dirent as an io, optionally yielding and auto-closing it.
  def open mode='r'
    raise Errno::EISDIR unless file?
    io = RangesIOMigrateable.new self, mode
    # TODO work on the mode string stuff a bit more.
    # maybe let the io object know about the mode, so it can refuse
    # to work for read/write appropriately. maybe redefine all unusable
    # methods using singleton class to throw errors.
    # for now, i just want to implement truncation on use of 'w'. later,
    # i need to do 'a' etc.
    case mode
    when 'r', 'r+'
      # as i don't enforce reading/writing, nothing changes here. kind of
      # need to enforce it if i want modify times to work better.
      @modify_time = Time.now if mode == 'r+'
    when 'w'
      @modify_time = Time.now
      # io.truncate 0
    #else
    # raise NotImplementedError, "unsupported mode - #{mode.inspect}"
    end
    if block_given?
      begin yield io
      ensure; io.close
      end
    else io
    end
  end

  def read limit=nil
    open { |io| io.read limit }
  end

  def file?
    type == :file
  end

  def dir?
    # to count root as a dir.
    !file?
  end

  # maybe need some options regarding case sensitivity.
  def / name
    children.find { |child| name === child.name }
  end

  def [] idx
    if String === idx
      #warn 'String form of Dirent#[] is deprecated'
      self / idx
    else
      super
    end
  end

  # move to ruby-msg. and remove from here
  def time
    #warn 'Dirent#time is deprecated'
    create_time || modify_time
  end

  def each_child(&block)
    @children.each(&block)
  end

  # flattens the tree starting from here into +dirents+. note it modifies its argument.
  def flatten dirents=[]
    @idx = dirents.length
    dirents << self
    if file?
      self.prev = self.next = self.child = EOT
    else
      children.each { |child| child.flatten dirents }
      self.child = Dirent.flatten_helper children
    end
    dirents
  end

  # i think making the tree structure optimized is actually more complex than this, and
  # requires some intelligent ordering of the children based on names, but as long as
  # it is valid its ok.
  # actually, i think its ok. gsf for example only outputs a singly-linked-list, where
  # prev is always EOT.
  def self.flatten_helper children
    return EOT if children.empty?
    i = children.length / 2
    this = children[i]
    this.prev, this.next = [(0...i), (i+1..-1)].map { |r| flatten_helper children[r] }
    this.idx
  end

  # Serialize this dirent back to its 128-byte binary form, refreshing the
  # derived struct fields (name, type_id, timestamps) first.
  def to_s
    tmp = Types::Variant.dump(Types::VT_LPWSTR, name)
    tmp = tmp[0, 62] if tmp.length > 62
    tmp += 0.chr * 2
    self.name_len = tmp.length
    self.name_utf16 = tmp + 0.chr * (64 - tmp.length)
    # type_id can perhaps be set in the initializer, as its read only now.
    # (block var renamed to avoid shadowing the #name accessor)
    self.type_id = TYPE_MAP.to_a.find { |id, sym| @type == sym }.first
    # for the case of files, it is assumed that that was handled already
    # note not dir?, so as not to override root's first_block
    self.first_block = Dirent::EOT if type == :dir
    if file?
      # this is messed up. it changes the time stamps regardless of whether the file
      # was actually touched. instead, any open call with a writeable mode, should update
      # the modify time. create time would be set in new.
      if @ole.params[:update_timestamps]
        self.create_time_str = Types::Variant.dump Types::VT_FILETIME, @create_time
        self.modify_time_str = Types::Variant.dump Types::VT_FILETIME, @modify_time
      end
    else
      self.create_time_str = 0.chr * 8
      self.modify_time_str = 0.chr * 8
    end
    to_a.pack PACK
  end

  def inspect
    str = "#<Dirent:#{name.inspect}"
    # perhaps i should remove the data snippet. its not that useful anymore.
    # there is also some dir specific stuff. like clsid, flags, that i should
    # probably include
    if file?
      tmp = read 9
      data = tmp.length == 9 ? tmp[0, 5] + '...' : tmp
      str << " size=#{size}" +
        "#{modify_time ? ' modify_time=' + modify_time.to_s.inspect : nil}" +
        " data=#{data.inspect}"
    end
    str + '>'
  end

  def delete child
    # remove from our child array, so that on reflatten and re-creation of @dirents, it will be gone
    raise ArgumentError, "#{child.inspect} not a child of #{self.inspect}" unless @children.delete child
    # free our blocks
    child.open { |io| io.truncate 0 }
  end

  def self.copy src, dst
    # copies the contents of src to dst. must be the same type. this will throw an
    # error on copying to root. maybe this will recurse too much for big documents??
    raise ArgumentError, 'differing types' if src.file? and !dst.file?
    dst.name = src.name
    if src.dir?
      src.children.each do |src_child|
        dst_child = Dirent.new dst.ole, :type => src_child.type
        dst.children << dst_child
        Dirent.copy src_child, dst_child
      end
    else
      src.open do |src_io|
        dst.open { |dst_io| IO.copy src_io, dst_io }
      end
    end
  end
end
|
||||
end
|
||||
end
|
||||
|
|
@ -0,0 +1,423 @@
|
|||
#
|
||||
# = Introduction
|
||||
#
|
||||
# This file intends to provide file system-like api support, a la <tt>zip/zipfilesystem</tt>.
|
||||
#
|
||||
# = TODO
|
||||
#
|
||||
# - need to implement some more IO functions on RangesIO, like #puts, #print
|
||||
# etc, like AbstractOutputStream from zipfile.
|
||||
#
|
||||
# - check Dir.mkdir, and File.open, and File.rename, to add in filename
|
||||
# length checks (max 32 / 31 or something).
|
||||
# do the automatic truncation, and add in any necessary warnings.
|
||||
#
|
||||
# - File.split('a/') == File.split('a') == ['.', 'a']
|
||||
# the implication of this, is that things that try to force directory
|
||||
# don't work. like, File.rename('a', 'b'), should work if a is a file
|
||||
# or directory, but File.rename('a/', 'b') should only work if a is
|
||||
# a directory. tricky, need to clean things up a bit more.
|
||||
# i think a general path name => dirent method would work, with flags
|
||||
# about what should raise an error.
|
||||
#
|
||||
# - Need to look at streamlining things after getting all the tests passing,
|
||||
# as this file's getting pretty long - almost half the real implementation.
|
||||
# and is probably more inefficient than necessary.
|
||||
# too many exceptions in the expected path of certain functions.
|
||||
#
|
||||
# - should look at profiles before and after switching ruby-msg to use
|
||||
# the filesystem api.
|
||||
#
|
||||
|
||||
require 'ole/storage'
|
||||
|
||||
module Ole # :nodoc:
|
||||
class Storage
|
||||
# Lazily constructed File-style facade (see FileClass) over this storage.
def file
  @file = FileClass.new(self) unless @file
  @file
end
|
||||
|
||||
# Lazily constructed Dir-style facade (see DirClass) over this storage.
def dir
  @dir = DirClass.new(self) unless @dir
  @dir
end
|
||||
|
||||
# Resolve +path+ (absolute, or relative to the current directory) to a
# Dirent. Returns nil when a component is missing, or when a file is used
# as if it were a directory.
def dirent_from_path path
  cursor = @root
  segments = file.expand_path(path).sub(/^\/*/, '').sub(/\/*$/, '').split(/\/+/)
  segments.each do |segment|
    return nil if cursor.file?
    cursor = cursor / segment
    return nil unless cursor
  end
  cursor
end
|
||||
|
||||
# Provides ::File-like class methods (exists?, open, rename, unlink, ...)
# over an Ole::Storage instance, a la zip/zipfilesystem.
class FileClass
  # A File::Stat lookalike built from a dirent. Most unix-specific fields
  # are necessarily faked - the ole file format has no analogs.
  class Stat
    attr_reader :ftype, :size, :blocks, :blksize
    attr_reader :nlink, :uid, :gid, :dev, :rdev, :ino
    def initialize dirent
      @dirent = dirent
      @size = dirent.size
      if file?
        @ftype = 'file'
        bat = dirent.ole.bat_for_size(dirent.size)
        @blocks = bat.chain(dirent.first_block).length
        @blksize = bat.block_size
      else
        @ftype = 'directory'
        @blocks = 0
        @blksize = 0
      end
      # a lot of these are bogus. ole file format has no analogs
      @nlink = 1
      @uid, @gid = 0, 0
      @dev, @rdev = 0, 0
      @ino = 0
      # need to add times - atime, mtime, ctime.
    end

    alias rdev_major :rdev
    alias rdev_minor :rdev

    def file?
      @dirent.file?
    end

    def directory?
      @dirent.dir?
    end

    def size?
      size if file?
    end

    def inspect
      # instance_variables returns Symbols on ruby >= 1.9, so compare as
      # strings - previously "- ['@dirent']" removed nothing there and the
      # internal @dirent leaked into the output.
      pairs = (instance_variables.map(&:to_s) - ['@dirent']).map do |n|
        "#{n[1..-1]}=#{instance_variable_get n}"
      end
      "#<#{self.class} #{pairs * ', '}>"
    end
  end

  def initialize ole
    @ole = ole
  end

  # Make +path+ absolute (against the dir facade's pwd) and normalize
  # any . / .. components.
  def expand_path path
    # get the raw stored pwd value (its blank for root)
    pwd = @ole.dir.instance_variable_get :@pwd
    # its only absolute if it starts with a '/'
    path = "#{pwd}/#{path}" unless path =~ /^\//
    # at this point its already absolute. we use File.expand_path
    # just for the .. and . handling
    # No longer use RUBY_PLATFORM =~ /win/ as it matches darwin. better way?
    File.expand_path(path)[File::ALT_SEPARATOR == "\\" ? (2..-1) : (0..-1)]
  end

  # +orig_path+ is just so that we can use the requested path
  # in the error messages even if it has been already modified
  def dirent_from_path path, orig_path=nil
    orig_path ||= path
    dirent = @ole.dirent_from_path path
    raise Errno::ENOENT, orig_path unless dirent
    raise Errno::EISDIR, orig_path if dirent.dir?
    dirent
  end
  private :dirent_from_path

  def exists? path
    !!@ole.dirent_from_path(path)
  end
  alias exist? :exists?

  def file? path
    dirent = @ole.dirent_from_path path
    dirent and dirent.file?
  end

  def directory? path
    dirent = @ole.dirent_from_path path
    dirent and dirent.dir?
  end

  # Open the file at +path+, creating it (in its parent directory) when the
  # mode requests creation. Yields/returns the dirent's io.
  def open path, mode='r', &block
    if IO::Mode.new(mode).create?
      begin
        dirent = dirent_from_path path
      rescue Errno::ENOENT
        # maybe instead of repeating this everywhere, i should have
        # a get_parent_dirent function.
        parent_path, basename = File.split expand_path(path)
        parent = @ole.dir.send :dirent_from_path, parent_path, path
        parent.children << dirent = Dirent.new(@ole, :type => :file, :name => basename)
      end
    else
      dirent = dirent_from_path path
    end
    dirent.open mode, &block
  end

  # explicit wrapper instead of alias to inhibit block
  def new path, mode='r'
    open path, mode
  end

  def size path
    dirent_from_path(path).size
  rescue Errno::EISDIR
    # kind of arbitrary. I'm getting 4096 from ::File, but
    # the zip tests want 0.
    0
  end

  def size? path
    dirent_from_path(path).size
  # any other exceptions i need to rescue?
  rescue Errno::ENOENT, Errno::EISDIR
    nil
  end

  def stat path
    # we do this to allow dirs.
    dirent = @ole.dirent_from_path path
    raise Errno::ENOENT, path unless dirent
    Stat.new dirent
  end

  def read path
    open path, &:read
  end

  # most of the work this function does is moving the dirent between
  # 2 parents. the actual name changing is quite simple.
  # File.rename can move a file into another folder, which is why i've
  # done it too, though i think its not always possible...
  #
  # FIXME File.rename can be used for directories too....
  def rename from_path, to_path
    # check what we want to rename from exists. do it this
    # way to allow directories.
    dirent = @ole.dirent_from_path from_path
    raise Errno::ENOENT, from_path unless dirent
    # delete what we want to rename to if necessary
    begin
      unlink to_path
    rescue Errno::ENOENT
      # we actually get here, but rcov doesn't think so. add 1 + 1 to
      # keep rcov happy for now... :)
      1 + 1
    end
    # reparent the dirent
    from_parent_path, from_basename = File.split expand_path(from_path)
    to_parent_path, to_basename = File.split expand_path(to_path)
    from_parent = @ole.dir.send :dirent_from_path, from_parent_path, from_path
    to_parent = @ole.dir.send :dirent_from_path, to_parent_path, to_path
    from_parent.children.delete dirent
    # and also change its name
    dirent.name = to_basename
    to_parent.children << dirent
    0
  end

  # crappy copy from Dir.
  def unlink(*paths)
    paths.each do |path|
      dirent = @ole.dirent_from_path path
      # i think we should free all of our blocks from the
      # allocation table.
      # i think if you run repack, all free blocks should get zeroed,
      # but currently the original data is there unmodified.
      open(path) { |f| f.truncate 0 }
      # remove ourself from our parent, so we won't be part of the dir
      # tree at save time.
      parent_path, basename = File.split expand_path(path)
      parent = @ole.dir.send :dirent_from_path, parent_path, path
      parent.children.delete dirent
    end
    paths.length # hmmm. as per ::File ?
  end
  alias delete :unlink
end
|
||||
|
||||
#
|
||||
# an *instance* of this class is supposed to provide similar methods
|
||||
# to the class methods of Dir itself.
|
||||
#
|
||||
# pretty complete. like zip/zipfilesystem's implementation, i provide
|
||||
# everything except chroot and glob. glob could be done with a glob
|
||||
# to regex regex, and then simply match in the entries array... although
|
||||
# recursive glob complicates that somewhat.
|
||||
#
|
||||
# Dir.chroot, Dir.glob, Dir.[], and Dir.tmpdir is the complete list.
|
||||
class DirClass
|
||||
# Bind to +ole+ and start with the working directory at the root
# (stored internally as the empty string).
def initialize ole
  @pwd = ''
  @ole = ole
end
|
||||
|
||||
# Resolve +path+ to a *directory* Dirent. +orig_path+ (defaulting to +path+)
# is what gets quoted in error messages, so callers can report the path
# exactly as the user originally supplied it.
def dirent_from_path path, orig_path=nil
  label = orig_path || path
  found = @ole.dirent_from_path path
  raise Errno::ENOENT, label unless found
  raise Errno::ENOTDIR, label unless found.dir?
  found
end
private :dirent_from_path
|
||||
|
||||
# Open the directory at +path+, yielding the Dir handle when a block is
# given, otherwise returning it.
def open path
  handle = Dir.new path, entries(path)
  return handle unless block_given?
  yield handle
end
|
||||
|
||||
# as for file - an explicit wrapper rather than an alias, so that a passed
# block is never forwarded to #open.
def new path
  open path
end
|
||||
|
||||
# The current working directory. Internally @pwd is stored without the
# trailing slash (blank for root); the root special case is handled here.
def pwd
  @pwd.empty? ? '/' : @pwd
end
alias getwd :pwd
|
||||
|
||||
# Change the working directory to +orig_path+. With a block, the change is
# temporary (restored on exit) and the block's value is returned; without,
# the change sticks and 0 is returned, as per ::Dir.chdir.
def chdir orig_path
  # make path absolute, squeeze slashes, and remove trailing slash
  path = @ole.file.expand_path(orig_path).gsub(/\/+/, '/').sub(/\/$/, '')
  # this is just for the side effects of the exceptions if invalid
  dirent_from_path path, orig_path
  unless block_given?
    @pwd = path
    return 0
  end
  saved = @pwd
  begin
    @pwd = path
    yield
  ensure
    @pwd = saved
  end
end
|
||||
|
||||
# List the entry names of the directory at +path+, including the
# conventional '.' and '..', warning about names this api can't reach.
def entries path
  dirent = dirent_from_path path
  # Not sure about adding on the dots...
  names = %w[. ..] + dirent.children.map(&:name)
  # do some checks about un-reachable files
  seen = {}
  names.each do |n|
    Log.warn "inaccessible file (filename contains slash) - #{n.inspect}" if n['/']
    Log.warn "inaccessible file (duplicate filename) - #{n.inspect}" if seen[n]
    seen[n] = true
  end
  names
end
|
||||
|
||||
# Yield each entry name of the directory at +path+, as ::Dir.foreach does.
def foreach path, &block
  entries(path).each(&block)
end
|
||||
|
||||
# Create a directory at +path+, raising EEXIST if any entry (file or dir)
# already has that name. Returns 0. (chroot, glob etc are still missing.)
def mkdir path
  parent_path, new_name = File.split @ole.file.expand_path(path)
  # note that we complain about the full path despite only accessing the
  # parent path - this is consistent with ::Dir
  parent = dirent_from_path parent_path, path
  # ensure nothing - file or directory - already exists under that name
  raise Errno::EEXIST, path if parent / new_name
  parent.children << Dirent.new(@ole, :type => :dir, :name => new_name)
  0
end
|
||||
|
||||
def rmdir path
|
||||
dirent = dirent_from_path path
|
||||
raise Errno::ENOTEMPTY, path unless dirent.children.empty?
|
||||
|
||||
# now delete it, how to do that? the canonical representation that is
|
||||
# maintained is the root tree, and the children array. we must remove it
|
||||
# from the children array.
|
||||
# we need the parent then. this sucks but anyway:
|
||||
# we need to split the path. but before we can do that, we need
|
||||
# to expand it first. eg. say we need the parent to unlink
|
||||
# a/b/../c. the parent should be a, not a/b/.., or a/b.
|
||||
parent_path, basename = File.split @ole.file.expand_path(path)
|
||||
# this shouldn't be able to fail if the above didn't
|
||||
parent = dirent_from_path parent_path
|
||||
# note that the way this currently works, on save and repack time this will get
|
||||
# reflected. to work properly, ie to make a difference now it would have to re-write
|
||||
# the dirent. i think that Ole::Storage#close will handle that. and maybe include a
|
||||
# #repack.
|
||||
parent.children.delete dirent
|
||||
0 # hmmm. as per ::Dir ?
|
||||
end
|
||||
alias delete :rmdir
|
||||
alias unlink :rmdir
|
||||
|
||||
# note that there is nothing remotely ole specific about
|
||||
# this class. it simply provides the dir like sequential access
|
||||
# methods on top of an array.
|
||||
# hmm, doesn't throw the IOError's on use of a closed directory...
|
||||
class Dir
|
||||
include Enumerable
|
||||
|
||||
attr_reader :path
|
||||
def initialize path, entries
|
||||
@path, @entries, @pos = path, entries, 0
|
||||
@closed = false
|
||||
end
|
||||
|
||||
def pos
|
||||
raise IOError if @closed
|
||||
@pos
|
||||
end
|
||||
|
||||
def each(&block)
|
||||
raise IOError if @closed
|
||||
@entries.each(&block)
|
||||
end
|
||||
|
||||
def close
|
||||
@closed = true
|
||||
end
|
||||
|
||||
def read
|
||||
raise IOError if @closed
|
||||
@entries[pos]
|
||||
ensure
|
||||
@pos += 1 if pos < @entries.length
|
||||
end
|
||||
|
||||
def pos= pos
|
||||
raise IOError if @closed
|
||||
@pos = [[0, pos].max, @entries.length].min
|
||||
end
|
||||
|
||||
def rewind
|
||||
raise IOError if @closed
|
||||
@pos = 0
|
||||
end
|
||||
|
||||
alias tell :pos
|
||||
alias seek :pos=
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -0,0 +1,148 @@
|
|||
require 'ole/types/property_set'
|
||||
|
||||
module Ole
|
||||
class Storage
|
||||
#
|
||||
# The MetaData class is designed to be high level interface to all the
|
||||
# underlying meta data stored within different sections, themselves within
|
||||
# different property set streams.
|
||||
#
|
||||
# With this class, you can simply get properties using their names, without
|
||||
# needing to know about the underlying guids, property ids etc.
|
||||
#
|
||||
# Example:
|
||||
#
|
||||
# Ole::Storage.open('test.doc') { |ole| p ole.meta_data.doc_author }
|
||||
#
|
||||
# TODO:
|
||||
#
|
||||
# * add write support
|
||||
# * fix some of the missing type coercion (eg FileTime)
|
||||
# * maybe add back the ability to access individual property sets as a unit
|
||||
# directly. ie <tt>ole.summary_information</tt>. Is this useful?
|
||||
# * full key support, for unknown keys, like
|
||||
# <tt>ole.meta_data[myguid, myid]</tt>. probably needed for user-defined
|
||||
# properties too.
|
||||
#
|
||||
class MetaData
|
||||
include Enumerable
|
||||
|
||||
FILE_MAP = {
|
||||
Types::PropertySet::FMTID_SummaryInformation => "\005SummaryInformation",
|
||||
Types::PropertySet::FMTID_DocSummaryInfo => "\005DocumentSummaryInformation"
|
||||
}
|
||||
|
||||
FORMAT_MAP = {
|
||||
'MSWordDoc' => :doc
|
||||
}
|
||||
|
||||
CLSID_EXCEL97 = Types::Clsid.parse "{00020820-0000-0000-c000-000000000046}"
|
||||
CLSID_EXCEL95 = Types::Clsid.parse "{00020810-0000-0000-c000-000000000046}"
|
||||
CLSID_WORD97 = Types::Clsid.parse "{00020906-0000-0000-c000-000000000046}"
|
||||
CLSID_WORD95 = Types::Clsid.parse "{00020900-0000-0000-c000-000000000046}"
|
||||
|
||||
CLSID_MAP = {
|
||||
CLSID_EXCEL97 => :xls,
|
||||
CLSID_EXCEL95 => :xls,
|
||||
CLSID_WORD97 => :doc,
|
||||
CLSID_WORD95 => :doc
|
||||
}
|
||||
|
||||
MIME_TYPES = {
|
||||
:xls => 'application/vnd.ms-excel',
|
||||
:doc => 'application/msword',
|
||||
:ppt => 'application/vnd.ms-powerpoint',
|
||||
# not registered at IANA, but seems most common usage
|
||||
:msg => 'application/vnd.ms-outlook',
|
||||
# this is my default fallback option. also not registered at IANA.
|
||||
# file(1)'s default is application/msword, which is useless...
|
||||
nil => 'application/x-ole-storage'
|
||||
}
|
||||
|
||||
def initialize ole
|
||||
@ole = ole
|
||||
end
|
||||
|
||||
# i'm thinking of making file_format and mime_type available through
|
||||
# #[], #each, and #to_h also, as calculated meta data (not assignable)
|
||||
|
||||
def comp_obj
|
||||
return {} unless dirent = @ole.root["\001CompObj"]
|
||||
data = dirent.read
|
||||
# see - https://gnunet.org/svn/Extractor/doc/StarWrite_File_Format.html
|
||||
# compobj_version: 0x0001
|
||||
# byte_order: 0xffe
|
||||
# windows_version: 0x00000a03 (win31 apparently)
|
||||
# marker: 0xffffffff
|
||||
compobj_version, byte_order, windows_version, marker, clsid =
|
||||
data.unpack("vvVVa#{Types::Clsid::SIZE}")
|
||||
strings = []
|
||||
i = 28
|
||||
while i < data.length
|
||||
len = data[i, 4].unpack('V').first
|
||||
i += 4
|
||||
strings << data[i, len - 1]
|
||||
i += len
|
||||
end
|
||||
# in the unknown chunk, you usually see something like 'Word.Document.6'
|
||||
{:username => strings[0], :file_format => strings[1], :unknown => strings[2..-1]}
|
||||
end
|
||||
private :comp_obj
|
||||
|
||||
def file_format
|
||||
comp_obj[:file_format]
|
||||
end
|
||||
|
||||
def mime_type
|
||||
# based on the CompObj stream contents
|
||||
type = FORMAT_MAP[file_format]
|
||||
return MIME_TYPES[type] if type
|
||||
|
||||
# based on the root clsid
|
||||
type = CLSID_MAP[Types::Clsid.load(@ole.root.clsid)]
|
||||
return MIME_TYPES[type] if type
|
||||
|
||||
# fallback to heuristics
|
||||
has_file = Hash[*@ole.root.children.map { |d| [d.name.downcase, true] }.flatten]
|
||||
return MIME_TYPES[:msg] if has_file['__nameid_version1.0'] or has_file['__properties_version1.0']
|
||||
return MIME_TYPES[:doc] if has_file['worddocument'] or has_file['document']
|
||||
return MIME_TYPES[:xls] if has_file['workbook'] or has_file['book']
|
||||
|
||||
MIME_TYPES[nil]
|
||||
end
|
||||
|
||||
def [] key
|
||||
pair = Types::PropertySet::PROPERTY_MAP[key.to_s] or return nil
|
||||
file = FILE_MAP[pair.first] or return nil
|
||||
dirent = @ole.root[file] or return nil
|
||||
dirent.open { |io| return Types::PropertySet.new(io)[key] }
|
||||
end
|
||||
|
||||
def []= key, value
|
||||
raise NotImplementedError, 'meta data writes not implemented'
|
||||
end
|
||||
|
||||
def each(&block)
|
||||
FILE_MAP.values.each do |file|
|
||||
dirent = @ole.root[file] or next
|
||||
dirent.open { |io| Types::PropertySet.new(io).each(&block) }
|
||||
end
|
||||
end
|
||||
|
||||
def to_h
|
||||
inject({}) { |hash, (name, value)| hash.update name.to_sym => value }
|
||||
end
|
||||
|
||||
def method_missing name, *args, &block
|
||||
return super unless args.empty?
|
||||
pair = Types::PropertySet::PROPERTY_MAP[name.to_s] or return super
|
||||
self[name]
|
||||
end
|
||||
end
|
||||
|
||||
def meta_data
|
||||
@meta_data ||= MetaData.new(self)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
@ -1,40 +1,73 @@
|
|||
#! /usr/bin/ruby
|
||||
|
||||
#
|
||||
# A file with general support functions used by most files in the project.
|
||||
#
|
||||
# These are the only methods added to other classes.
|
||||
#
|
||||
|
||||
require 'logger'
|
||||
require 'stringio'
|
||||
require 'enumerator'
|
||||
|
||||
class String # :nodoc:
|
||||
# plural of String#index. returns all offsets of +string+. rename to indices?
|
||||
#
|
||||
# note that it doesn't check for overlapping values.
|
||||
def indexes string
|
||||
# in some ways i'm surprised that $~ works properly in this case...
|
||||
to_enum(:scan, /#{Regexp.quote string}/m).map { $~.begin 0 }
|
||||
end
|
||||
|
||||
def each_chunk size
|
||||
(length / size.to_f).ceil.times { |i| yield self[i * size, size] }
|
||||
end
|
||||
end
|
||||
|
||||
class File # :nodoc:
|
||||
# for consistency with StringIO and others. makes more sense than forcing
|
||||
# them to provide a #stat
|
||||
# for interface consistency with StringIO etc (rather than adding #stat
|
||||
# to them). used by RangesIO.
|
||||
def size
|
||||
stat.size
|
||||
end
|
||||
end
|
||||
|
||||
class Symbol # :nodoc:
|
||||
def to_proc
|
||||
proc { |a| a.send self }
|
||||
unless :x.respond_to? :to_proc
|
||||
def to_proc
|
||||
proc { |a| a.send self }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
module Enumerable # :nodoc:
|
||||
# 1.9 backport
|
||||
def group_by
|
||||
hash = Hash.new { |hash, key| hash[key] = [] }
|
||||
each { |item| hash[yield(item)] << item }
|
||||
hash
|
||||
unless [].respond_to? :group_by
|
||||
# 1.9 backport
|
||||
def group_by
|
||||
hash = Hash.new { |h, key| h[key] = [] }
|
||||
each { |item| hash[yield(item)] << item }
|
||||
hash
|
||||
end
|
||||
end
|
||||
|
||||
def sum initial=0
|
||||
inject(initial) { |a, b| a + b }
|
||||
unless [].respond_to? :sum
|
||||
def sum initial=0
|
||||
inject(initial) { |a, b| a + b }
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# move to support?
|
||||
class IO # :nodoc:
|
||||
# Copy data from IO-like object +src+, to +dst+
|
||||
def self.copy src, dst
|
||||
until src.eof?
|
||||
buf = src.read(4096)
|
||||
dst.write buf
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
class Logger # :nodoc:
|
||||
# A helper method for creating <tt>Logger</tt>s which produce call stack
|
||||
# A helper method for creating a +Logger+ which produce call stack
|
||||
# in their output
|
||||
def self.new_with_callstack logdev=STDERR
|
||||
log = Logger.new logdev
|
||||
|
@ -48,4 +81,176 @@ class Logger # :nodoc:
|
|||
end
|
||||
log
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Include this module into a class that defines #each_child. It should
|
||||
# maybe use #each instead, but its easier to be more specific, and use
|
||||
# an alias.
|
||||
#
|
||||
# I don't want to force the class to cache children (eg where children
|
||||
# are loaded on request in pst), because that forces the whole tree to
|
||||
# be loaded. So, the methods should only call #each_child once, and
|
||||
# breadth first iteration holds its own copy of the children around.
|
||||
#
|
||||
# Main methods are #recursive, and #to_tree
|
||||
module RecursivelyEnumerable # :nodoc:
|
||||
def each_recursive_depth_first(&block)
|
||||
each_child do |child|
|
||||
yield child
|
||||
if child.respond_to? :each_recursive_depth_first
|
||||
child.each_recursive_depth_first(&block)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# don't think this is actually a proper breadth first recursion. only first
|
||||
# level is breadth first.
|
||||
def each_recursive_breadth_first(&block)
|
||||
children = []
|
||||
each_child do |child|
|
||||
children << child if child.respond_to? :each_recursive_breadth_first
|
||||
yield child
|
||||
end
|
||||
children.each { |child| child.each_recursive_breadth_first(&block) }
|
||||
end
|
||||
|
||||
def each_recursive mode=:depth_first, &block
|
||||
# we always actually yield ourself (the tree root) before recursing
|
||||
yield self
|
||||
send "each_recursive_#{mode}", &block
|
||||
end
|
||||
|
||||
# the idea of this function, is to allow use of regular Enumerable methods
|
||||
# in a recursive fashion. eg:
|
||||
#
|
||||
# # just looks at top level children
|
||||
# root.find { |child| child.some_condition? }
|
||||
# # recurse into all children getting non-folders, breadth first
|
||||
# root.recursive(:breadth_first).select { |child| !child.folder? }
|
||||
# # just get everything
|
||||
# items = root.recursive.to_a
|
||||
#
|
||||
def recursive mode=:depth_first
|
||||
to_enum(:each_recursive, mode)
|
||||
end
|
||||
|
||||
# streams a "tree" form of the recursively enumerable structure to +io+, or
|
||||
# return a string form instead if +io+ is not specified.
|
||||
#
|
||||
# mostly a debugging aid. can specify a different block which will be called
|
||||
# to provide the string form for each node.
|
||||
def to_tree io='', &inspect
|
||||
inspect ||= :inspect.to_proc
|
||||
io << "- #{inspect[self]}\n"
|
||||
recurse = proc do |node, prefix|
|
||||
child = nil
|
||||
node.each_child do |next_child|
|
||||
if child
|
||||
io << "#{prefix}|- #{inspect[child]}\n"
|
||||
recurse.call child, prefix + '| '
|
||||
end
|
||||
child = next_child
|
||||
end if node.respond_to?(:each_child)
|
||||
if child
|
||||
io << "#{prefix}\\- #{inspect[child]}\n"
|
||||
recurse.call child, prefix + ' '
|
||||
end
|
||||
end
|
||||
recurse.call self, ' '
|
||||
io
|
||||
end
|
||||
end
|
||||
|
||||
# can include File::Constants
|
||||
class IO
|
||||
# this is for jruby
|
||||
include File::Constants unless defined?(RDONLY)
|
||||
|
||||
# nabbed from rubinius, and modified
|
||||
def self.parse_mode mode
|
||||
ret = 0
|
||||
|
||||
case mode[0, 1]
|
||||
when 'r'; ret |= RDONLY
|
||||
when 'w'; ret |= WRONLY | CREAT | TRUNC
|
||||
when 'a'; ret |= WRONLY | CREAT | APPEND
|
||||
else raise ArgumentError, "illegal access mode #{mode}"
|
||||
end
|
||||
|
||||
(1...mode.length).each do |i|
|
||||
case mode[i, 1]
|
||||
when '+'; ret = (ret & ~(RDONLY | WRONLY)) | RDWR
|
||||
when 'b'; ret |= Mode::BINARY
|
||||
else raise ArgumentError, "illegal access mode #{mode}"
|
||||
end
|
||||
end
|
||||
|
||||
ret
|
||||
end
|
||||
|
||||
class Mode
|
||||
# ruby 1.9 defines binary as 0, which isn't very helpful.
|
||||
# its 4 in rubinius. no longer using
|
||||
#
|
||||
# BINARY = 0x4 unless defined?(BINARY)
|
||||
#
|
||||
# for that reason, have my own constants module here
|
||||
module Constants
|
||||
include File::Constants
|
||||
BINARY = 0x4
|
||||
end
|
||||
|
||||
include Constants
|
||||
NAMES = %w[rdonly wronly rdwr creat trunc append binary]
|
||||
|
||||
attr_reader :flags
|
||||
def initialize flags
|
||||
flags = IO.parse_mode flags.to_str if flags.respond_to? :to_str
|
||||
raise ArgumentError, "invalid flags - #{flags.inspect}" unless Fixnum === flags
|
||||
@flags = flags
|
||||
end
|
||||
|
||||
def writeable?
|
||||
#(@flags & RDONLY) == 0
|
||||
(@flags & 0x3) != RDONLY
|
||||
end
|
||||
|
||||
def readable?
|
||||
(@flags & WRONLY) == 0
|
||||
end
|
||||
|
||||
def truncate?
|
||||
(@flags & TRUNC) != 0
|
||||
end
|
||||
|
||||
def append?
|
||||
(@flags & APPEND) != 0
|
||||
end
|
||||
|
||||
def create?
|
||||
(@flags & CREAT) != 0
|
||||
end
|
||||
|
||||
def binary?
|
||||
(@flags & BINARY) != 0
|
||||
end
|
||||
|
||||
=begin
|
||||
# revisit this
|
||||
def apply io
|
||||
if truncate?
|
||||
io.truncate 0
|
||||
elsif append?
|
||||
io.seek IO::SEEK_END, 0
|
||||
end
|
||||
end
|
||||
=end
|
||||
|
||||
def inspect
|
||||
names = NAMES.map { |name| name if (flags & Mode.const_get(name.upcase)) != 0 }
|
||||
names.unshift 'rdonly' if (flags & 0x3) == 0
|
||||
"#<#{self.class} #{names.compact * '|'}>"
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
|
|
|
@ -1,27 +1,2 @@
|
|||
require 'ole/base'
|
||||
|
||||
module Ole # :nodoc:
|
||||
# FIXME
|
||||
module Types
|
||||
# Parse two 32 bit time values into a DateTime
|
||||
# Time is stored as a high and low 32 bit value, comprising the
|
||||
# 100's of nanoseconds since 1st january 1601 (Epoch).
|
||||
# struct FILETIME. see eg http://msdn2.microsoft.com/en-us/library/ms724284.aspx
|
||||
def self.load_time str
|
||||
low, high = str.unpack 'L2'
|
||||
time = EPOCH + (high * (1 << 32) + low) * 1e-7 / 86400 rescue return
|
||||
# extra sanity check...
|
||||
unless (1800...2100) === time.year
|
||||
Log.warn "ignoring unlikely time value #{time.to_s}"
|
||||
return nil
|
||||
end
|
||||
time
|
||||
end
|
||||
|
||||
# turn a binary guid into something displayable.
|
||||
# this will probably become a proper class later
|
||||
def self.load_guid str
|
||||
"{%08x-%04x-%04x-%02x%02x-#{'%02x' * 6}}" % str.unpack('L S S CC C6')
|
||||
end
|
||||
end
|
||||
end
|
||||
require 'ole/types/base'
|
||||
require 'ole/types/property_set'
|
||||
|
|
|
@ -0,0 +1,251 @@
|
|||
require 'iconv'
|
||||
require 'date'
|
||||
|
||||
require 'ole/base'
|
||||
|
||||
module Ole # :nodoc:
|
||||
#
|
||||
# The Types module contains all the serialization and deserialization code for standard ole
|
||||
# types.
|
||||
#
|
||||
# It also defines all the variant type constants, and symbolic names.
|
||||
#
|
||||
module Types
|
||||
# for anything that we don't have serialization code for
|
||||
class Data < String
|
||||
def self.load str
|
||||
new str
|
||||
end
|
||||
|
||||
def self.dump str
|
||||
str.to_s
|
||||
end
|
||||
end
|
||||
|
||||
class Lpstr < String
|
||||
def self.load str
|
||||
# not sure if its always there, but there is often a trailing
|
||||
# null byte.
|
||||
new str.chomp(0.chr)
|
||||
end
|
||||
|
||||
def self.dump str
|
||||
# do i need to append the null byte?
|
||||
str.to_s
|
||||
end
|
||||
end
|
||||
|
||||
# for VT_LPWSTR
|
||||
class Lpwstr < String
|
||||
FROM_UTF16 = Iconv.new 'utf-8', 'utf-16le'
|
||||
TO_UTF16 = Iconv.new 'utf-16le', 'utf-8'
|
||||
|
||||
def self.load str
|
||||
new FROM_UTF16.iconv(str).chomp(0.chr)
|
||||
end
|
||||
|
||||
def self.dump str
|
||||
# need to append nulls?
|
||||
data = TO_UTF16.iconv str
|
||||
# not sure if this is the recommended way to do it, but I want to treat
|
||||
# the resulting utf16 data as regular bytes, not characters.
|
||||
data.force_encoding Encoding::US_ASCII if data.respond_to? :encoding
|
||||
data
|
||||
end
|
||||
end
|
||||
|
||||
# for VT_FILETIME
|
||||
class FileTime < DateTime
|
||||
SIZE = 8
|
||||
EPOCH = new 1601, 1, 1
|
||||
|
||||
# Create a +DateTime+ object from a struct +FILETIME+
|
||||
# (http://msdn2.microsoft.com/en-us/library/ms724284.aspx).
|
||||
#
|
||||
# Converts +str+ to two 32 bit time values, comprising the high and low 32 bits of
|
||||
# the 100's of nanoseconds since 1st january 1601 (Epoch).
|
||||
def self.load str
|
||||
low, high = str.to_s.unpack 'V2'
|
||||
# we ignore these, without even warning about it
|
||||
return nil if low == 0 and high == 0
|
||||
# switched to rational, and fixed the off by 1 second error i sometimes got.
|
||||
# time = EPOCH + (high * (1 << 32) + low) / 1e7 / 86400 rescue return
|
||||
# use const_get to ensure we can return anything which subclasses this (VT_DATE?)
|
||||
const_get('EPOCH') + Rational(high * (1 << 32) + low, 1e7.to_i * 86400) rescue return
|
||||
# extra sanity check...
|
||||
#unless (1800...2100) === time.year
|
||||
# Log.warn "ignoring unlikely time value #{time.to_s}"
|
||||
# return nil
|
||||
#end
|
||||
#time
|
||||
end
|
||||
|
||||
# +time+ should be able to be either a Time, Date, or DateTime.
|
||||
def self.dump time
|
||||
# i think i'll convert whatever i get to be a datetime, because of
|
||||
# the covered range.
|
||||
return 0.chr * SIZE unless time
|
||||
time = time.send(:to_datetime) if Time === time
|
||||
# don't bother to use const_get here
|
||||
bignum = (time - EPOCH) * 86400 * 1e7.to_i
|
||||
high, low = bignum.divmod 1 << 32
|
||||
[low, high].pack 'V2'
|
||||
end
|
||||
|
||||
def inspect
|
||||
"#<#{self.class} #{to_s}>"
|
||||
end
|
||||
end
|
||||
|
||||
# for VT_CLSID
|
||||
# Unlike most of the other conversions, the Guid's are serialized/deserialized by actually
|
||||
# doing nothing! (eg, _load & _dump are null ops)
|
||||
# Rather, its just a string with a different inspect string, and it includes a
|
||||
# helper method for creating a Guid from that readable form (#format).
|
||||
class Clsid < String
|
||||
SIZE = 16
|
||||
PACK = 'V v v CC C6'
|
||||
|
||||
def self.load str
|
||||
new str.to_s
|
||||
end
|
||||
|
||||
def self.dump guid
|
||||
return 0.chr * SIZE unless guid
|
||||
# allow use of plain strings in place of guids.
|
||||
guid['-'] ? parse(guid) : guid
|
||||
end
|
||||
|
||||
def self.parse str
|
||||
vals = str.scan(/[a-f\d]+/i).map(&:hex)
|
||||
if vals.length == 5
|
||||
# this is pretty ugly
|
||||
vals[3] = ('%04x' % vals[3]).scan(/../).map(&:hex)
|
||||
vals[4] = ('%012x' % vals[4]).scan(/../).map(&:hex)
|
||||
guid = new vals.flatten.pack(PACK)
|
||||
return guid if guid.format.delete('{}') == str.downcase.delete('{}')
|
||||
end
|
||||
raise ArgumentError, 'invalid guid - %p' % str
|
||||
end
|
||||
|
||||
def format
|
||||
"%08x-%04x-%04x-%02x%02x-#{'%02x' * 6}" % unpack(PACK)
|
||||
end
|
||||
|
||||
def inspect
|
||||
"#<#{self.class}:{#{format}}>"
|
||||
end
|
||||
end
|
||||
|
||||
#
|
||||
# The OLE variant types, extracted from
|
||||
# http://www.marin.clara.net/COM/variant_type_definitions.htm.
|
||||
#
|
||||
# A subset is also in WIN32OLE::VARIANT, but its not cross platform (obviously).
|
||||
#
|
||||
# Use like:
|
||||
#
|
||||
# p Ole::Types::Variant::NAMES[0x001f] => 'VT_LPWSTR'
|
||||
# p Ole::Types::VT_DATE # => 7
|
||||
#
|
||||
# The serialization / deserialization functions should be fixed to make it easier
|
||||
# to work with. like
|
||||
#
|
||||
# Ole::Types.from_str(VT_DATE, data) # and
|
||||
# Ole::Types.to_str(VT_DATE, data)
|
||||
#
|
||||
# Or similar, rather than having to do VT_* <=> ad hoc class name etc as it is
|
||||
# currently.
|
||||
#
|
||||
module Variant
|
||||
NAMES = {
|
||||
0x0000 => 'VT_EMPTY',
|
||||
0x0001 => 'VT_NULL',
|
||||
0x0002 => 'VT_I2',
|
||||
0x0003 => 'VT_I4',
|
||||
0x0004 => 'VT_R4',
|
||||
0x0005 => 'VT_R8',
|
||||
0x0006 => 'VT_CY',
|
||||
0x0007 => 'VT_DATE',
|
||||
0x0008 => 'VT_BSTR',
|
||||
0x0009 => 'VT_DISPATCH',
|
||||
0x000a => 'VT_ERROR',
|
||||
0x000b => 'VT_BOOL',
|
||||
0x000c => 'VT_VARIANT',
|
||||
0x000d => 'VT_UNKNOWN',
|
||||
0x000e => 'VT_DECIMAL',
|
||||
0x0010 => 'VT_I1',
|
||||
0x0011 => 'VT_UI1',
|
||||
0x0012 => 'VT_UI2',
|
||||
0x0013 => 'VT_UI4',
|
||||
0x0014 => 'VT_I8',
|
||||
0x0015 => 'VT_UI8',
|
||||
0x0016 => 'VT_INT',
|
||||
0x0017 => 'VT_UINT',
|
||||
0x0018 => 'VT_VOID',
|
||||
0x0019 => 'VT_HRESULT',
|
||||
0x001a => 'VT_PTR',
|
||||
0x001b => 'VT_SAFEARRAY',
|
||||
0x001c => 'VT_CARRAY',
|
||||
0x001d => 'VT_USERDEFINED',
|
||||
0x001e => 'VT_LPSTR',
|
||||
0x001f => 'VT_LPWSTR',
|
||||
0x0040 => 'VT_FILETIME',
|
||||
0x0041 => 'VT_BLOB',
|
||||
0x0042 => 'VT_STREAM',
|
||||
0x0043 => 'VT_STORAGE',
|
||||
0x0044 => 'VT_STREAMED_OBJECT',
|
||||
0x0045 => 'VT_STORED_OBJECT',
|
||||
0x0046 => 'VT_BLOB_OBJECT',
|
||||
0x0047 => 'VT_CF',
|
||||
0x0048 => 'VT_CLSID',
|
||||
0x0fff => 'VT_ILLEGALMASKED',
|
||||
0x0fff => 'VT_TYPEMASK',
|
||||
0x1000 => 'VT_VECTOR',
|
||||
0x2000 => 'VT_ARRAY',
|
||||
0x4000 => 'VT_BYREF',
|
||||
0x8000 => 'VT_RESERVED',
|
||||
0xffff => 'VT_ILLEGAL'
|
||||
}
|
||||
|
||||
CLASS_MAP = {
|
||||
# haven't seen one of these. wonder if its same as FILETIME?
|
||||
#'VT_DATE' => ?,
|
||||
'VT_LPSTR' => Lpstr,
|
||||
'VT_LPWSTR' => Lpwstr,
|
||||
'VT_FILETIME' => FileTime,
|
||||
'VT_CLSID' => Clsid
|
||||
}
|
||||
|
||||
module Constants
|
||||
NAMES.each { |num, name| const_set name, num }
|
||||
end
|
||||
|
||||
def self.load type, str
|
||||
type = NAMES[type] or raise ArgumentError, 'unknown ole type - 0x%04x' % type
|
||||
(CLASS_MAP[type] || Data).load str
|
||||
end
|
||||
|
||||
def self.dump type, variant
|
||||
type = NAMES[type] or raise ArgumentError, 'unknown ole type - 0x%04x' % type
|
||||
(CLASS_MAP[type] || Data).dump variant
|
||||
end
|
||||
end
|
||||
|
||||
include Variant::Constants
|
||||
|
||||
# deprecated aliases, kept mostly for the benefit of ruby-msg, until
|
||||
# i release a new version.
|
||||
def self.load_guid str
|
||||
Variant.load VT_CLSID, str
|
||||
end
|
||||
|
||||
def self.load_time str
|
||||
Variant.load VT_FILETIME, str
|
||||
end
|
||||
|
||||
FROM_UTF16 = Lpwstr::FROM_UTF16
|
||||
TO_UTF16 = Lpwstr::TO_UTF16
|
||||
end
|
||||
end
|
||||
|
|
@ -0,0 +1,165 @@
|
|||
require 'ole/types'
|
||||
require 'yaml'
|
||||
|
||||
module Ole
|
||||
module Types
|
||||
#
|
||||
# The PropertySet class currently supports readonly access to the properties
|
||||
# serialized in "property set" streams, such as the file "\005SummaryInformation",
|
||||
# in OLE files.
|
||||
#
|
||||
# Think it has its roots in MFC property set serialization.
|
||||
#
|
||||
# See http://poi.apache.org/hpsf/internals.html for details
|
||||
#
|
||||
class PropertySet
|
||||
HEADER_SIZE = 28
|
||||
HEADER_PACK = "vvVa#{Clsid::SIZE}V"
|
||||
OS_MAP = {
|
||||
0 => :win16,
|
||||
1 => :mac,
|
||||
2 => :win32,
|
||||
0x20001 => :ooffice, # open office on linux...
|
||||
}
|
||||
|
||||
# define a smattering of the property set guids.
|
||||
DATA = YAML.load_file(File.dirname(__FILE__) + '/../../../data/propids.yaml').
|
||||
inject({}) { |hash, (key, value)| hash.update Clsid.parse(key) => value }
|
||||
|
||||
# create an inverted map of names to guid/key pairs
|
||||
PROPERTY_MAP = DATA.inject({}) do |h1, (guid, data)|
|
||||
data[1].inject(h1) { |h2, (id, name)| h2.update name => [guid, id] }
|
||||
end
|
||||
|
||||
module Constants
|
||||
DATA.each { |guid, (name, map)| const_set name, guid }
|
||||
end
|
||||
|
||||
include Constants
|
||||
include Enumerable
|
||||
|
||||
class Section
|
||||
include Variant::Constants
|
||||
include Enumerable
|
||||
|
||||
SIZE = Clsid::SIZE + 4
|
||||
PACK = "a#{Clsid::SIZE}v"
|
||||
|
||||
attr_accessor :guid, :offset
|
||||
attr_reader :length
|
||||
|
||||
def initialize str, property_set
|
||||
@property_set = property_set
|
||||
@guid, @offset = str.unpack PACK
|
||||
self.guid = Clsid.load guid
|
||||
load_header
|
||||
end
|
||||
|
||||
def io
|
||||
@property_set.io
|
||||
end
|
||||
|
||||
def load_header
|
||||
io.seek offset
|
||||
@byte_size, @length = io.read(8).unpack 'V2'
|
||||
end
|
||||
|
||||
def [] key
|
||||
each_raw do |id, property_offset|
|
||||
return read_property(property_offset).last if key == id
|
||||
end
|
||||
nil
|
||||
end
|
||||
|
||||
def []= key, value
|
||||
raise NotImplementedError, 'section writes not yet implemented'
|
||||
end
|
||||
|
||||
def each
|
||||
each_raw do |id, property_offset|
|
||||
yield id, read_property(property_offset).last
|
||||
end
|
||||
end
|
||||
|
||||
private
|
||||
|
||||
def each_raw
|
||||
io.seek offset + 8
|
||||
io.read(length * 8).each_chunk(8) { |str| yield(*str.unpack('V2')) }
|
||||
end
|
||||
|
||||
def read_property property_offset
|
||||
io.seek offset + property_offset
|
||||
type, value = io.read(8).unpack('V2')
|
||||
# is the method of serialization here custom?
|
||||
case type
|
||||
when VT_LPSTR, VT_LPWSTR
|
||||
value = Variant.load type, io.read(value)
|
||||
# ....
|
||||
end
|
||||
[type, value]
|
||||
end
|
||||
end
|
||||
|
||||
attr_reader :io, :signature, :unknown, :os, :guid, :sections
|
||||
|
||||
def initialize io
|
||||
@io = io
|
||||
load_header io.read(HEADER_SIZE)
|
||||
load_section_list io.read(@num_sections * Section::SIZE)
|
||||
# expect no gap between last section and start of data.
|
||||
#Log.warn "gap between section list and property data" unless io.pos == @sections.map(&:offset).min
|
||||
end
|
||||
|
||||
def load_header str
|
||||
@signature, @unknown, @os_id, @guid, @num_sections = str.unpack HEADER_PACK
|
||||
# should i check that unknown == 0? it usually is. so is the guid actually
|
||||
@guid = Clsid.load @guid
|
||||
@os = OS_MAP[@os_id] || Log.warn("unknown operating system id #{@os_id}")
|
||||
end
|
||||
|
||||
def load_section_list str
|
||||
@sections = str.to_enum(:each_chunk, Section::SIZE).map { |s| Section.new s, self }
|
||||
end
|
||||
|
||||
def [] key
|
||||
pair = PROPERTY_MAP[key.to_s] or return nil
|
||||
section = @sections.find { |s| s.guid == pair.first } or return nil
|
||||
section[pair.last]
|
||||
end
|
||||
|
||||
def []= key, value
|
||||
pair = PROPERTY_MAP[key.to_s] or return nil
|
||||
section = @sections.find { |s| s.guid == pair.first } or return nil
|
||||
section[pair.last] = value
|
||||
end
|
||||
|
||||
def method_missing name, *args, &block
|
||||
if name.to_s =~ /(.*)=$/
|
||||
return super unless args.length == 1
|
||||
return super unless PROPERTY_MAP[$1]
|
||||
self[$1] = args.first
|
||||
else
|
||||
return super unless args.length == 0
|
||||
return super unless PROPERTY_MAP[name.to_s]
|
||||
self[name]
|
||||
end
|
||||
end
|
||||
|
||||
def each
|
||||
@sections.each do |section|
|
||||
next unless pair = DATA[section.guid]
|
||||
map = pair.last
|
||||
section.each do |id, value|
|
||||
name = map[id] or next
|
||||
yield name, value
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
def to_h
|
||||
inject({}) { |hash, (name, value)| hash.update name.to_sym => value }
|
||||
end
|
||||
end
|
||||
end
|
||||
end
|
Loading…
Reference in New Issue