Source code for etk.extractors.excel_extractor
from typing import List, Dict, Tuple
import copy
import re
import pyexcel
from etk.etk import ETK
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction
[docs]class ExcelExtractor(Extractor):
"""
**Description**
This class is a wrapper of pyexcel. By defining 'variables' argument,
this extractor is able to extract multiple information and store them
into different field at once.
Examples:
::
excel_extractor = ExcelExtractor()
excel_extractor.extract(file_name:path_to_excel,
sheet_name:sheet_name,
region:['A,1', 'Z,10'],
variables={
# get current col & row for field1
'field1': '$col,$row',
# get fixed cell of $A,$5 for field2
'field2': '$A,$5'
})
"""
_re_row_identifier = re.compile(r'(\$[0-9]+)')
_re_col_identifier = re.compile(r'(\$[A-Za-z]+)')
def __init__(self, etk: ETK = None, extractor_name: str = 'excel extractor') -> None:
Extractor.__init__(self,
input_type=InputType.TEXT,
category="data extractor",
name=extractor_name)
self._etk = etk
[docs] def extract(self, file_name: str, sheet_name: str, region: List, variables: Dict) -> List[Extraction]:
"""
Args:
file_name (str): file name
sheet_name (str): sheet name
region (List[]): from upper left cell to bottom right cell, e.g., ['A,1', 'Z,10']
variables (Dict): key is variable name, value can be:
1. a single expression 2. comma separated expression, will be treated as location
$row, $col are built-in variables can be used in expression
constant row and column value can be noted as $NAME (e.g., $1, $10, $A, $GG)
Returns:
List[Extraction] : A list of extracted variables dictionary
"""
extractions = []
book = pyexcel.get_book(file_name=file_name)
sheet = book[sheet_name]
region = [ExcelExtractor._excel_coord_to_location(coord) for coord in region]
r = region[0][0]
# per row
for row in sheet.region(region[0], region[1]):
c = region[0][1]
# per col
for col in row:
var = copy.deepcopy(variables)
# per variable
for k, v in var.items():
parsed_v = ExcelExtractor._parse_variable(v, r, c)
if len(parsed_v) == 1: # normal variable
var[k] = parsed_v[0]
else: # location
rr, cc = parsed_v
var[k] = sheet[rr, cc]
extractions.append(var)
c += 1
r += 1
return extractions
@staticmethod
def _col_name_to_num(name: str) -> int:
name = name.upper()
pow = 1
col_num = 0
for letter in name[::-1]:
col_num += (int(letter, 36) - 9) * pow
pow *= 26
return col_num - 1
@staticmethod
def _row_name_to_num(name: str) -> int:
try:
num = int(name) - 1
if num >= 0:
return num
raise ValueError('Invalid row name')
except:
raise ValueError('Invalid row name')
@staticmethod
def _excel_coord_to_location(s: str) -> Tuple:
ss = s.split(',')
return ExcelExtractor._row_name_to_num(ss[1]), ExcelExtractor._col_name_to_num(ss[0])
@staticmethod
def _parse_variable(s: str, curr_row: int, curr_col: int) -> Tuple:
'''
$A,$2 <- constant col and row
$row,$2 <- current col, row 2
$A+1,$2 <- col A + 1 = 2, row 2
$row+1,$2 <- current col + 1, row 2
$A,$2-1 <-- col A, row 2 - 1 = 1
'''
def parse_expression(ss, curr_row, curr_col):
ss = ss.replace('$row', str(curr_row))
ss = ss.replace('$col', str(curr_col))
ss = ExcelExtractor._re_row_identifier.sub(
lambda x: str(ExcelExtractor._row_name_to_num(x.group()[1:])) if len(x.group()) > 0 else '', ss)
ss = ExcelExtractor._re_col_identifier.sub(
lambda x: str(ExcelExtractor._col_name_to_num(x.group()[1:])) if len(x.group()) > 0 else '', ss)
return eval(ss)
ss = s.split(',')
if len(ss) == 1:
return parse_expression(ss[0], curr_row, curr_col),
elif len(ss) == 2:
rr, cc = (ss[1], ss[0])
return parse_expression(rr, curr_row, curr_col), parse_expression(cc, curr_row, curr_col)
else:
raise ValueError('Invalid variable')