Statistics
| Revision:

root / trunk / lib / coderay / tokens.rb

History | View | Annotate | Download (9.3 KB)

1
module CodeRay

  # GZip library for writing and reading token dumps.
  autoload :GZip, 'coderay/helpers/gzip'

  # = Tokens  TODO: Rewrite!
  #
  # The Tokens class represents a list of tokens returned from
  # a Scanner.
  #
  # A token is not a special object, just a pair consisting of
  # * the _token_ _text_ (the original source of the token in a String) or
  #   a _token_ _action_ (begin_group, end_group, begin_line, end_line)
  # * the _token_ _kind_ (a Symbol representing the type of the token)
  #
  # A token looks like this:
  #
  #   ['# It looks like this', :comment]
  #   ['3.1415926', :float]
  #   ['$^', :error]
  #
  # Some scanners also yield sub-tokens, represented by special
  # token actions, namely begin_group and end_group.
  #
  # The Ruby scanner, for example, splits "a string" into:
  #
  #  [
  #   [:begin_group, :string],
  #   ['"', :delimiter],
  #   ['a string', :content],
  #   ['"', :delimiter],
  #   [:end_group, :string]
  #  ]
  #
  # NOTE: The examples above show tokens as pairs for readability. This
  # implementation stores them flat: each token occupies two consecutive
  # elements of the Tokens array (text or action first, then kind). That
  # is why #count returns <tt>size / 2</tt> and why #text_token,
  # #begin_group etc. push two elements at once.
  #
  # Tokens is the interface between Scanners and Encoders:
  # The input is split and saved into a Tokens object. The Encoder
  # then builds the output from this object.
  #
  # Thus, the syntax below becomes clear:
  #
  #   CodeRay.scan('price = 2.59', :ruby).html
  #   # the Tokens object is here -------^
  #
  # See how small it is? ;)
  #
  # Tokens gives you the power to handle pre-scanned code very easily:
  # You can convert it to a webpage, a YAML file, or dump it into a gzip'ed string
  # that you put in your DB.
  #
  # It also allows you to generate tokens directly (without using a scanner),
  # to load them from a file, and still use any Encoder that CodeRay provides.
  class Tokens < Array

    autoload :AbbreviationForKind, 'coderay/token_kinds'

    # The Scanner instance that created the tokens.
    attr_accessor :scanner

    # Encode the tokens using encoder.
    #
    # encoder can be
    # * a symbol like :html or :statistic
    # * an Encoder class
    # * an Encoder object
    #
    # options are passed to the encoder: to its constructor when an
    # instance has to be created here, and always to +encode_tokens+.
    #
    # Returns whatever the encoder's +encode_tokens+ returns.
    def encode encoder, options = {}
      unless encoder.is_a? Encoders::Encoder
        # A Class is instantiated directly; anything else (e.g. a Symbol)
        # is looked up in the Encoders plugin host first.
        encoder_class = encoder.is_a?(Class) ? encoder : Encoders[encoder]
        encoder = encoder_class.new options
      end
      encoder.encode_tokens self, options
    end

    # Turn into a string by encoding the tokens with a plain
    # Encoders::Encoder instance (created without options).
    def to_s
      encode Encoders::Encoder.new
    end

    # Redirects unknown methods to encoder calls.
    #
    # For example, if you call +tokens.html+, the HTML encoder
    # is used to highlight the tokens.
    #
    # If no encoder plugin with that name is found
    # (PluginHost::PluginNotFound), the call falls through to the
    # default +method_missing+.
    def method_missing meth, options = {}
      encode_with meth, options
    rescue PluginHost::PluginNotFound
      super
    end

    # Encode the tokens with the encoder plugin registered under the
    # name +encoder+.
    #
    # Unlike #encode, +options+ are only passed to the encoder's
    # constructor, not to +encode_tokens+.
    def encode_with encoder, options = {}
      Encoders[encoder].new(options).encode_tokens self
    end

    # Returns the tokens compressed by joining consecutive
    # tokens of the same kind.
    #
    # This can not be undone, but should yield the same output
    # in most Encoders.  It basically makes the output smaller.
    #
    # Combined with dump, it saves space for the cost of time.
    #
    # If the scanner is written carefully, this is not required -
    # for example, consecutive //-comment lines could already be
    # joined in one comment token by the Scanner.
    #
    # Currently always raises NotImplementedError.
    def optimize
      raise NotImplementedError, 'Tokens#optimize needs to be rewritten.'
      # Unreachable legacy implementation, kept as a reference for the
      # rewrite. It iterates over [text, kind] pairs.
      last_kind = last_text = nil
      new = self.class.new
      for text, kind in self
        if text.is_a? String
          if kind == last_kind
            last_text << text
          else
            new << [last_text, last_kind] if last_kind
            last_text = text
            last_kind = kind
          end
        else
          new << [last_text, last_kind] if last_kind
          last_kind = last_text = nil
          new << [text, kind]
        end
      end
      new << [last_text, last_kind] if last_kind
      new
    end

    # Compact the object itself; see optimize.
    def optimize!
      replace optimize
    end

    # Ensure that all begin_group tokens have a correspondent end_group.
    #
    # Currently always raises NotImplementedError.
    #
    # TODO: Test this!
    def fix
      raise NotImplementedError, 'Tokens#fix needs to be rewritten.'
      # Unreachable legacy implementation, kept as a reference for the
      # rewrite. It iterates over [type, kind] pairs.
      tokens = self.class.new
      # Check token nesting using a stack of kinds.
      opened = []
      for type, kind in self
        case type
        when :begin_group
          opened.push [:begin_group, kind]
        when :begin_line
          opened.push [:end_line, kind]
        when :end_group, :end_line
          expected = opened.pop
          if [type, kind] != expected
            # Unexpected end; decide what to do based on the kind:
            # - token was never opened: delete the end (just skip it)
            next unless opened.rindex expected
            # - token was opened earlier: also close tokens in between
            tokens << token until (token = opened.pop) == expected
          end
        end
        tokens << [type, kind]
      end
      # Close remaining opened tokens
      tokens << token while token = opened.pop
      tokens
    end

    # Repair the object itself; see fix.
    def fix!
      replace fix
    end

    # TODO: Scanner#split_into_lines
    #
    # Makes sure that:
    # - newlines are single tokens
    #   (which means all other tokens are single-line)
    # - there are no open tokens at the end of the line
    #
    # This makes it simple for encoders that work line-oriented,
    # like HTML with list-style numeration.
    #
    # Currently always raises NotImplementedError.
    def split_into_lines
      raise NotImplementedError
    end

    # In-place variant of split_into_lines.
    def split_into_lines!
      replace split_into_lines
    end

    # Split the tokens into parts of the given +sizes+.
    #
    # The result will be an Array of Tokens objects. The parts have
    # the text size specified by the parameter. In addition, each
    # part closes all opened tokens. This is useful to insert tokens
    # between them.
    #
    # Text that remains after all +sizes+ are used up goes into one
    # final part; if fewer parts than +sizes+ were produced, the result
    # is padded with empty Tokens objects.
    #
    # This method is used by @Scanner#tokenize@ when called with an Array
    # of source strings. The Diff encoder uses it for inline highlighting.
    def split_into_parts *sizes
      parts = []
      opened = []      # stack of [action, kind] pairs that are still open
      content = nil    # first half of the current token; nil = waiting for one
      part = Tokens.new
      part_size = 0    # text size already placed into the current part
      size = sizes.first
      i = 0
      # The array is flat, so elements arrive alternately as
      # content (text or action) and kind.
      for item in self
        case content
        when nil
          content = item
        when String
          if size && part_size + content.size > size  # token must be cut
            if part_size < size  # some part of the token goes into this part
              content = content.dup  # content may not be safe to change
              part << content.slice!(0, size - part_size) << item
            end
            # close all open groups and lines...
            closing = opened.reverse.flatten.map do |content_or_kind|
              case content_or_kind
              when :begin_group
                :end_group
              when :begin_line
                :end_line
              else
                content_or_kind
              end
            end
            parts << part.concat(closing)
            part = Tokens.new
            # ...and open them again.
            part.concat opened.flatten
            part_size = 0
            size = sizes[i += 1]
            # Re-run this iteration with the same kind so the rest of
            # the text is placed into the following part(s).
            redo unless content.empty?
          else
            part << content << item
            part_size += content.size
          end
          content = nil
        when Symbol
          case content
          when :begin_group, :begin_line
            opened << [content, item]
          when :end_group, :end_line
            opened.pop
          else
            raise 'Unknown token action: %p, kind = %p' % [content, item]
          end
          part << content << item
          content = nil
        else
          raise 'else case reached'
        end
      end
      parts << part
      parts << Tokens.new while parts.size < sizes.size
      parts
    end

    # Dumps the object into a String that can be saved
    # in files or databases.
    #
    # The dump is created with Marshal.dump;
    # In addition, it is gzipped using GZip.gzip.
    #
    # The returned String object is extended with Undumping
    # so it has an #undump method. See Tokens.load.
    #
    # You can configure the level of compression,
    # but the default value 7 should be what you want
    # in most cases as it is a good compromise between
    # speed and compression rate.
    #
    # See GZip module.
    def dump gzip_level = 7
      marshalled = Marshal.dump self
      zipped = GZip.gzip marshalled, gzip_level
      zipped.extend Undumping
    end

    # Return the actual number of tokens.
    #
    # Each token occupies two elements of the array.
    def count
      size / 2
    end

    # Include this module to give an object an #undump
    # method.
    #
    # The string returned by Tokens.dump includes Undumping.
    module Undumping
      # Calls Tokens.load with itself.
      def undump
        Tokens.load self
      end
    end

    # Undump the object: unzip it using GZip.gunzip, then
    # restore it using Marshal.load.
    #
    # The result is commonly a Tokens object, but
    # this is not guaranteed.
    #
    # SECURITY: Marshal.load can instantiate arbitrary objects; only
    # call this with dumps from a trusted source.
    def Tokens.load dump
      Marshal.load GZip.gunzip(dump)
    end

    # Append a text token: pushes the text and its kind.
    alias text_token push
    # Token actions; each pushes the action Symbol followed by the kind.
    def begin_group kind; push :begin_group, kind end
    def end_group kind; push :end_group, kind end
    def begin_line kind; push :begin_line, kind end
    def end_line kind; push :end_line, kind end
    # Append all elements of another token list.
    alias tokens concat

  end

end