add tablepyxl
This commit is contained in:
parent
dfba983c82
commit
e6de4b1e37
|
@ -0,0 +1,13 @@
|
|||
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
|
@ -0,0 +1,283 @@
|
|||
# This is where we handle translating css styles into openpyxl styles
|
||||
# and cascading those from parent to child in the dom.
|
||||
|
||||
from openpyxl.cell import cell
|
||||
from openpyxl.styles import Font, Alignment, PatternFill, NamedStyle, Border, Side, Color
|
||||
from openpyxl.styles.fills import FILL_SOLID
|
||||
from openpyxl.styles.numbers import FORMAT_CURRENCY_USD_SIMPLE, FORMAT_PERCENTAGE
|
||||
from openpyxl.styles.colors import BLACK
|
||||
|
||||
FORMAT_DATE_MMDDYYYY = 'mm/dd/yyyy'
|
||||
|
||||
|
||||
def colormap(color):
|
||||
"""
|
||||
Convenience for looking up known colors
|
||||
"""
|
||||
cmap = {'black': BLACK}
|
||||
return cmap.get(color, color)
|
||||
|
||||
|
||||
def style_string_to_dict(style):
|
||||
"""
|
||||
Convert css style string to a python dictionary
|
||||
"""
|
||||
def clean_split(string, delim):
|
||||
return (s.strip() for s in string.split(delim))
|
||||
styles = [clean_split(s, ":") for s in style.split(";") if ":" in s]
|
||||
return dict(styles)
|
||||
|
||||
|
||||
def get_side(style, name):
|
||||
return {'border_style': style.get('border-{}-style'.format(name)),
|
||||
'color': colormap(style.get('border-{}-color'.format(name)))}
|
||||
|
||||
known_styles = {}
|
||||
|
||||
|
||||
def style_dict_to_named_style(style_dict, number_format=None):
|
||||
"""
|
||||
Change css style (stored in a python dictionary) to openpyxl NamedStyle
|
||||
"""
|
||||
|
||||
style_and_format_string = str({
|
||||
'style_dict': style_dict,
|
||||
'parent': style_dict.parent,
|
||||
'number_format': number_format,
|
||||
})
|
||||
|
||||
if style_and_format_string not in known_styles:
|
||||
# Font
|
||||
font = Font(bold=style_dict.get('font-weight') == 'bold',
|
||||
color=style_dict.get_color('color', None),
|
||||
size=style_dict.get('font-size'))
|
||||
|
||||
# Alignment
|
||||
alignment = Alignment(horizontal=style_dict.get('text-align', 'general'),
|
||||
vertical=style_dict.get('vertical-align'),
|
||||
wrap_text=style_dict.get('white-space', 'nowrap') == 'normal')
|
||||
|
||||
# Fill
|
||||
bg_color = style_dict.get_color('background-color')
|
||||
fg_color = style_dict.get_color('foreground-color', Color())
|
||||
fill_type = style_dict.get('fill-type')
|
||||
if bg_color and bg_color != 'transparent':
|
||||
fill = PatternFill(fill_type=fill_type or FILL_SOLID,
|
||||
start_color=bg_color,
|
||||
end_color=fg_color)
|
||||
else:
|
||||
fill = PatternFill()
|
||||
|
||||
# Border
|
||||
border = Border(left=Side(**get_side(style_dict, 'left')),
|
||||
right=Side(**get_side(style_dict, 'right')),
|
||||
top=Side(**get_side(style_dict, 'top')),
|
||||
bottom=Side(**get_side(style_dict, 'bottom')),
|
||||
diagonal=Side(**get_side(style_dict, 'diagonal')),
|
||||
diagonal_direction=None,
|
||||
outline=Side(**get_side(style_dict, 'outline')),
|
||||
vertical=None,
|
||||
horizontal=None)
|
||||
|
||||
name = 'Style {}'.format(len(known_styles) + 1)
|
||||
|
||||
pyxl_style = NamedStyle(name=name, font=font, fill=fill, alignment=alignment, border=border,
|
||||
number_format=number_format)
|
||||
|
||||
known_styles[style_and_format_string] = pyxl_style
|
||||
|
||||
return known_styles[style_and_format_string]
|
||||
|
||||
|
||||
class StyleDict(dict):
|
||||
"""
|
||||
It's like a dictionary, but it looks for items in the parent dictionary
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.parent = kwargs.pop('parent', None)
|
||||
super(StyleDict, self).__init__(*args, **kwargs)
|
||||
|
||||
def __getitem__(self, item):
|
||||
if item in self:
|
||||
return super(StyleDict, self).__getitem__(item)
|
||||
elif self.parent:
|
||||
return self.parent[item]
|
||||
else:
|
||||
raise KeyError('{} not found'.format(item))
|
||||
|
||||
def __hash__(self):
|
||||
return hash(tuple([(k, self.get(k)) for k in self._keys()]))
|
||||
|
||||
# Yielding the keys avoids creating unnecessary data structures
|
||||
# and happily works with both python2 and python3 where the
|
||||
# .keys() method is a dictionary_view in python3 and a list in python2.
|
||||
def _keys(self):
|
||||
yielded = set()
|
||||
for k in self.keys():
|
||||
yielded.add(k)
|
||||
yield k
|
||||
if self.parent:
|
||||
for k in self.parent._keys():
|
||||
if k not in yielded:
|
||||
yielded.add(k)
|
||||
yield k
|
||||
|
||||
def get(self, k, d=None):
|
||||
try:
|
||||
return self[k]
|
||||
except KeyError:
|
||||
return d
|
||||
|
||||
def get_color(self, k, d=None):
|
||||
"""
|
||||
Strip leading # off colors if necessary
|
||||
"""
|
||||
color = self.get(k, d)
|
||||
if hasattr(color, 'startswith') and color.startswith('#'):
|
||||
color = color[1:]
|
||||
if len(color) == 3: # Premailers reduces colors like #00ff00 to #0f0, openpyxl doesn't like that
|
||||
color = ''.join(2 * c for c in color)
|
||||
return color
|
||||
|
||||
|
||||
class Element(object):
|
||||
"""
|
||||
Our base class for representing an html element along with a cascading style.
|
||||
The element is created along with a parent so that the StyleDict that we store
|
||||
can point to the parent's StyleDict.
|
||||
"""
|
||||
def __init__(self, element, parent=None):
|
||||
self.element = element
|
||||
self.number_format = None
|
||||
parent_style = parent.style_dict if parent else None
|
||||
self.style_dict = StyleDict(style_string_to_dict(element.get('style', '')), parent=parent_style)
|
||||
self._style_cache = None
|
||||
|
||||
def style(self):
|
||||
"""
|
||||
Turn the css styles for this element into an openpyxl NamedStyle.
|
||||
"""
|
||||
if not self._style_cache:
|
||||
self._style_cache = style_dict_to_named_style(self.style_dict, number_format=self.number_format)
|
||||
return self._style_cache
|
||||
|
||||
def get_dimension(self, dimension_key):
|
||||
"""
|
||||
Extracts the dimension from the style dict of the Element and returns it as a float.
|
||||
"""
|
||||
dimension = self.style_dict.get(dimension_key)
|
||||
if dimension:
|
||||
if dimension[-2:] in ['px', 'em', 'pt', 'in', 'cm']:
|
||||
dimension = dimension[:-2]
|
||||
dimension = float(dimension)
|
||||
return dimension
|
||||
|
||||
|
||||
class Table(Element):
|
||||
"""
|
||||
The concrete implementations of Elements are semantically named for the types of elements we are interested in.
|
||||
This defines a very concrete tree structure for html tables that we expect to deal with. I prefer this compared to
|
||||
allowing Element to have an arbitrary number of children and dealing with an abstract element tree.
|
||||
"""
|
||||
def __init__(self, table):
|
||||
"""
|
||||
takes an html table object (from lxml)
|
||||
"""
|
||||
super(Table, self).__init__(table)
|
||||
table_head = table.find('thead')
|
||||
self.head = TableHead(table_head, parent=self) if table_head is not None else None
|
||||
table_body = table.find('tbody')
|
||||
self.body = TableBody(table_body if table_body is not None else table, parent=self)
|
||||
|
||||
|
||||
class TableHead(Element):
|
||||
"""
|
||||
This class maps to the `<th>` element of the html table.
|
||||
"""
|
||||
def __init__(self, head, parent=None):
|
||||
super(TableHead, self).__init__(head, parent=parent)
|
||||
self.rows = [TableRow(tr, parent=self) for tr in head.findall('tr')]
|
||||
|
||||
|
||||
class TableBody(Element):
|
||||
"""
|
||||
This class maps to the `<tbody>` element of the html table.
|
||||
"""
|
||||
def __init__(self, body, parent=None):
|
||||
super(TableBody, self).__init__(body, parent=parent)
|
||||
self.rows = [TableRow(tr, parent=self) for tr in body.findall('tr')]
|
||||
|
||||
|
||||
class TableRow(Element):
|
||||
"""
|
||||
This class maps to the `<tr>` element of the html table.
|
||||
"""
|
||||
def __init__(self, tr, parent=None):
|
||||
super(TableRow, self).__init__(tr, parent=parent)
|
||||
self.cells = [TableCell(cell, parent=self) for cell in tr.findall('th') + tr.findall('td')]
|
||||
|
||||
|
||||
def element_to_string(el):
|
||||
return _element_to_string(el).strip()
|
||||
|
||||
|
||||
def _element_to_string(el):
|
||||
string = ''
|
||||
|
||||
for x in el.iterchildren():
|
||||
string += '\n' + _element_to_string(x)
|
||||
|
||||
text = el.text.strip() if el.text else ''
|
||||
tail = el.tail.strip() if el.tail else ''
|
||||
|
||||
return text + string + '\n' + tail
|
||||
|
||||
|
||||
class TableCell(Element):
|
||||
"""
|
||||
This class maps to the `<td>` element of the html table.
|
||||
"""
|
||||
CELL_TYPES = {'TYPE_STRING', 'TYPE_FORMULA', 'TYPE_NUMERIC', 'TYPE_BOOL', 'TYPE_CURRENCY', 'TYPE_PERCENTAGE',
|
||||
'TYPE_NULL', 'TYPE_INLINE', 'TYPE_ERROR', 'TYPE_FORMULA_CACHE_STRING', 'TYPE_INTEGER'}
|
||||
|
||||
def __init__(self, cell, parent=None):
|
||||
super(TableCell, self).__init__(cell, parent=parent)
|
||||
self.value = element_to_string(cell)
|
||||
self.number_format = self.get_number_format()
|
||||
|
||||
def data_type(self):
|
||||
cell_types = self.CELL_TYPES & set(self.element.get('class', '').split())
|
||||
if cell_types:
|
||||
if 'TYPE_FORMULA' in cell_types:
|
||||
# Make sure TYPE_FORMULA takes precedence over the other classes in the set.
|
||||
cell_type = 'TYPE_FORMULA'
|
||||
elif cell_types & {'TYPE_CURRENCY', 'TYPE_INTEGER', 'TYPE_PERCENTAGE'}:
|
||||
cell_type = 'TYPE_NUMERIC'
|
||||
else:
|
||||
cell_type = cell_types.pop()
|
||||
else:
|
||||
cell_type = 'TYPE_STRING'
|
||||
return getattr(cell, cell_type)
|
||||
|
||||
def get_number_format(self):
|
||||
if 'TYPE_CURRENCY' in self.element.get('class', '').split():
|
||||
return FORMAT_CURRENCY_USD_SIMPLE
|
||||
if 'TYPE_INTEGER' in self.element.get('class', '').split():
|
||||
return '#,##0'
|
||||
if 'TYPE_PERCENTAGE' in self.element.get('class', '').split():
|
||||
return FORMAT_PERCENTAGE
|
||||
if 'TYPE_DATE' in self.element.get('class', '').split():
|
||||
return FORMAT_DATE_MMDDYYYY
|
||||
if self.data_type() == cell.TYPE_NUMERIC:
|
||||
try:
|
||||
int(self.value)
|
||||
except ValueError:
|
||||
return '#,##0.##'
|
||||
else:
|
||||
return '#,##0'
|
||||
|
||||
def format(self, cell):
|
||||
cell.style = self.style()
|
||||
data_type = self.data_type()
|
||||
if data_type:
|
||||
cell.data_type = data_type
|
|
@ -0,0 +1,118 @@
|
|||
# Do imports like python3 so our package works for 2 and 3
|
||||
from __future__ import absolute_import
|
||||
|
||||
from lxml import html
|
||||
from openpyxl import Workbook
|
||||
from openpyxl.utils import get_column_letter
|
||||
from premailer import Premailer
|
||||
from tablepyxl.style import Table
|
||||
|
||||
|
||||
def string_to_int(s):
|
||||
if s.isdigit():
|
||||
return int(s)
|
||||
return 0
|
||||
|
||||
|
||||
def get_Tables(doc):
|
||||
tree = html.fromstring(doc)
|
||||
comments = tree.xpath('//comment()')
|
||||
for comment in comments:
|
||||
comment.drop_tag()
|
||||
return [Table(table) for table in tree.xpath('//table')]
|
||||
|
||||
|
||||
def write_rows(worksheet, elem, row, column=1):
|
||||
"""
|
||||
Writes every tr child element of elem to a row in the worksheet
|
||||
returns the next row after all rows are written
|
||||
"""
|
||||
from openpyxl.cell.cell import MergedCell
|
||||
|
||||
initial_column = column
|
||||
for table_row in elem.rows:
|
||||
for table_cell in table_row.cells:
|
||||
cell = worksheet.cell(row=row, column=column)
|
||||
while isinstance(cell, MergedCell):
|
||||
column += 1
|
||||
cell = worksheet.cell(row=row, column=column)
|
||||
|
||||
colspan = string_to_int(table_cell.element.get("colspan", "1"))
|
||||
rowspan = string_to_int(table_cell.element.get("rowspan", "1"))
|
||||
if rowspan > 1 or colspan > 1:
|
||||
worksheet.merge_cells(start_row=row, start_column=column,
|
||||
end_row=row + rowspan - 1, end_column=column + colspan - 1)
|
||||
|
||||
cell.value = table_cell.value
|
||||
table_cell.format(cell)
|
||||
min_width = table_cell.get_dimension('min-width')
|
||||
max_width = table_cell.get_dimension('max-width')
|
||||
|
||||
if colspan == 1:
|
||||
# Initially, when iterating for the first time through the loop, the width of all the cells is None.
|
||||
# As we start filling in contents, the initial width of the cell (which can be retrieved by:
|
||||
# worksheet.column_dimensions[get_column_letter(column)].width) is equal to the width of the previous
|
||||
# cell in the same column (i.e. width of A2 = width of A1)
|
||||
width = max(worksheet.column_dimensions[get_column_letter(column)].width or 0, len(table_cell.value) + 2)
|
||||
if max_width and width > max_width:
|
||||
width = max_width
|
||||
elif min_width and width < min_width:
|
||||
width = min_width
|
||||
worksheet.column_dimensions[get_column_letter(column)].width = width
|
||||
column += colspan
|
||||
row += 1
|
||||
column = initial_column
|
||||
return row
|
||||
|
||||
|
||||
def table_to_sheet(table, wb):
|
||||
"""
|
||||
Takes a table and workbook and writes the table to a new sheet.
|
||||
The sheet title will be the same as the table attribute name.
|
||||
"""
|
||||
ws = wb.create_sheet(title=table.element.get('name'))
|
||||
insert_table(table, ws, 1, 1)
|
||||
|
||||
|
||||
def document_to_workbook(doc, wb=None, base_url=None):
|
||||
"""
|
||||
Takes a string representation of an html document and writes one sheet for
|
||||
every table in the document.
|
||||
The workbook is returned
|
||||
"""
|
||||
if not wb:
|
||||
wb = Workbook()
|
||||
wb.remove(wb.active)
|
||||
|
||||
inline_styles_doc = Premailer(doc, base_url=base_url, remove_classes=False).transform()
|
||||
tables = get_Tables(inline_styles_doc)
|
||||
|
||||
for table in tables:
|
||||
table_to_sheet(table, wb)
|
||||
|
||||
return wb
|
||||
|
||||
|
||||
def document_to_xl(doc, filename, base_url=None):
|
||||
"""
|
||||
Takes a string representation of an html document and writes one sheet for
|
||||
every table in the document. The workbook is written out to a file called filename
|
||||
"""
|
||||
wb = document_to_workbook(doc, base_url=base_url)
|
||||
wb.save(filename)
|
||||
|
||||
|
||||
def insert_table(table, worksheet, column, row):
|
||||
if table.head:
|
||||
row = write_rows(worksheet, table.head, row, column)
|
||||
if table.body:
|
||||
row = write_rows(worksheet, table.body, row, column)
|
||||
|
||||
|
||||
def insert_table_at_cell(table, cell):
|
||||
"""
|
||||
Inserts a table at the location of an openpyxl Cell object.
|
||||
"""
|
||||
ws = cell.parent
|
||||
column, row = cell.column, cell.row
|
||||
insert_table(table, ws, column, row)
|
Loading…
Reference in New Issue