# ====================================================================
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ====================================================================
from StringIO import StringIO
from HTMLParser import HTMLParser
class InputStreamReader(object):
def __init__(self, inputStream, encoding):
super(InputStreamReader, self).__init__()
self.inputStream = inputStream
self.encoding = encoding or 'utf-8'
def _read(self, length):
return self.inputStream.read(length)
def read(self, length=-1):
text = self._read(length)
text = unicode(text, self.encoding)
return text
def close(self):
self.inputStream.close()
class HTMLReader(object):
def __init__(self, reader):
self.reader = reader
class htmlParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.buffer = StringIO()
self.position = 0
def handle_data(self, data):
self.buffer.write(data)
def _read(self, length):
buffer = self.buffer
size = buffer.tell() - self.position
if length > 0 and size > length:
buffer.seek(self.position)
data = buffer.read(length)
self.position += len(data)
buffer.seek(0, 2)
elif size > 0:
buffer.seek(self.position)
data = buffer.read(size)
self.position = 0
buffer.seek(0)
else:
data = ''
return data
self.parser = htmlParser()
def read(self, length=-1):
while True:
data = self.reader.read(length)
if len(data) > 0:
self.parser.feed(data)
data = self.parser._read(length)
if len(data) == 0:
continue
return data
def close(self):
self.reader.close()
|