root / trunk / lib / coderay / tokens.rb
History | View | Annotate | Download (9.3 KB)
module CodeRay

  # GZip library for writing and reading token dumps.
  autoload :GZip, 'coderay/helpers/gzip'

  # = Tokens  TODO: Rewrite!
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a two-element Array
  # consisting of
  # * the _token_ _text_ (the original source of the token in a String) or
  #   a _token_ _action_ (begin_group, end_group, begin_line, end_line)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['$^', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token actions, namely begin_group and end_group.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #  [
  #   [:begin_group, :string],
  #   ['"', :delimiter],
  #   ['a string', :content],
  #   ['"', :delimiter],
  #   [:end_group, :string]
  #  ]
  #
  # NOTE: The methods defined in this class store a token as two
  # consecutive elements of the Tokens array (text or action first, then
  # kind) rather than as a nested pair; see #count and #begin_group.
  #
  # Tokens is the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
  # that you put in your DB.
  #
  # It also allows you to generate tokens directly (without using a scanner),
  # to load them from a file, and still use any Encoder that CodeRay provides.
  class Tokens < Array

    autoload :AbbreviationForKind, 'coderay/token_kinds'

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # An Encoder class is instantiated directly; anything else is
        # resolved through Encoders[] first.
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string by encoding the tokens with a new
    # Encoders::Encoder instance.
    def to_s
      encode Encoders::Encoder.new
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    #
    # Falls back to the default behaviour (via +super+) when
    # PluginHost::PluginNotFound is raised.
    def method_missing meth, options = {}
      encode_with meth, options
    rescue PluginHost::PluginNotFound
      super
    end

    # Encode the tokens with the encoder found under Encoders[encoder].
    #
    # +options+ are passed to the encoder's constructor only.
    def encode_with encoder, options = {}
      Encoders[encoder].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This can not be undone, but should yield the same output
    # in most Encoders.  It basically makes the output smaller.
    #
    # Combined with dump, it saves space for the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
    #
    # Currently disabled: always raises NotImplementedError.
    def optimize
      raise NotImplementedError, 'Tokens#optimize needs to be rewritten.'
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all begin_group tokens have a correspondent end_group.
    #
    # Currently disabled: always raises NotImplementedError.
    #
    # TODO: Test this!
    def fix
      raise NotImplementedError, 'Tokens#fix needs to be rewritten.'
    end

    # Repair the object itself; see fix.
    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other token are single-line)
    # - there are no open tokens at the end of the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numeration.
    #
    # Not implemented yet: always raises NotImplementedError.
    def split_into_lines
      raise NotImplementedError
    end

    # Split the object itself into lines; see split_into_lines.
    def split_into_lines!
      replace split_into_lines
    end

    # Split the tokens into parts of the given +sizes+.
    #
    # The result will be an Array of Tokens objects. The parts have
    # the text size specified by the parameter. In addition, each
    # part closes all opened tokens. This is useful to insert tokens
    # between them.
    #
    # If there are fewer parts than sizes, the result is padded with
    # empty Tokens objects.
    #
    # This method is used by @Scanner#tokenize@ when called with an Array
    # of source strings. The Diff encoder uses it for inline highlighting.
    def split_into_parts *sizes
      parts = []
      opened = []
      content = nil
      part = Tokens.new
      part_size = 0
      size = sizes.first
      i = 0
      # The array is flat, so elements are consumed in pairs: +content+
      # remembers the first element (text or action) until its kind arrives.
      each do |item|
        case content
        when nil
          content = item
        when String
          if size && part_size + content.size > size  # token must be cut
            if part_size < size  # some part of the token goes into this part
              content = content.dup  # content may not be safe to change
              part << content.slice!(0, size - part_size) << item
            end
            # close all open groups and lines...
            closing = opened.reverse.flatten.map do |content_or_kind|
              case content_or_kind
              when :begin_group
                :end_group
              when :begin_line
                :end_line
              else
                content_or_kind
              end
            end
            parts << part.concat(closing)
            part = Tokens.new
            # ...and open them again.
            part.concat opened.flatten
            part_size = 0
            size = sizes[i += 1]
            # Process the rest of the token again with the next size.
            redo unless content.empty?
          else
            part << content << item
            part_size += content.size
          end
          content = nil
        when Symbol
          case content
          when :begin_group, :begin_line
            opened << [content, item]
          when :end_group, :end_line
            opened.pop
          else
            raise 'Unknown token action: %p, kind = %p' % [content, item]
          end
          part << content << item
          content = nil
        else
          raise 'else case reached'
        end
      end
      parts << part
      parts << Tokens.new while parts.size < sizes.size
      parts
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # In addition, it is gzipped using GZip.gzip.
    #
    # The returned String object is extended with Undumping
    # so it has an #undump method.  See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      marshalled = Marshal.dump self
      zipped = GZip.gzip marshalled, gzip_level
      zipped.extend Undumping
    end

    # Return the actual number of tokens.
    #
    # Each token occupies two consecutive elements of the array.
    def count
      size / 2
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Undump the object: unzip it using GZip.gunzip, then
    # restore it using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    #
    # NOTE(review): Marshal.load must never be fed untrusted data; only
    # load dumps that were created by Tokens#dump from a trusted source.
    def self.load dump
      Marshal.load GZip.gunzip(dump)
    end

    # Token-building helpers; each one appends two elements
    # (text or action, then kind) to the array.
    alias text_token push
    def begin_group kind; push :begin_group, kind end
    def end_group kind; push :end_group, kind end
    def begin_line kind; push :begin_line, kind end
    def end_line kind; push :end_line, kind end
    alias tokens concat

  end

end
|