Source code for etk.extractors.excel_extractor

from typing import List, Dict, Tuple

import copy
import re
import pyexcel

from etk.etk import ETK
from etk.extractor import Extractor, InputType
from etk.extraction import Extraction


[docs]class ExcelExtractor(Extractor):
    """
    **Description**
        This class is a wrapper of pyexcel. By defining 'variables' argument,
        this extractor is able to extract multiple information and store them
        into different field at once.

    Examples:
        ::

            excel_extractor = ExcelExtractor()
            excel_extractor.extract(file_name:path_to_excel,
                                    sheet_name:sheet_name,
                                    region:['A,1', 'Z,10'],
                                    variables={
                                        # get current col & row for field1
                                        'field1': '$col,$row',
                                        # get fixed cell of $A,$5 for field2
                                        'field2': '$A,$5'
                                        })

    """

    _re_row_identifier = re.compile(r'(\$[0-9]+)')
    _re_col_identifier = re.compile(r'(\$[A-Za-z]+)')

    def __init__(self, etk: ETK = None, extractor_name: str = 'excel extractor') -> None:
        Extractor.__init__(self,
                           input_type=InputType.TEXT,
                           category="data extractor",
                           name=extractor_name)
        self._etk = etk

[docs]    def extract(self, file_name: str, sheet_name: str, region: List, variables: Dict) -> List[Extraction]:
        """
        Args:
            file_name (str): file name
            sheet_name (str): sheet name
            region (List[]): from upper left cell to bottom right cell, e.g., ['A,1', 'Z,10']
            variables (Dict): key is variable name, value can be:
                              1. a single expression 2. comma separated expression, will be treated as location
                              $row, $col are built-in variables can be used in expression
                              constant row and column value can be noted as $NAME (e.g., $1, $10, $A, $GG)
        
        Returns:
            List[Extraction] : A list of extracted variables dictionary
            
        """
        extractions = []

        book = pyexcel.get_book(file_name=file_name)
        sheet = book[sheet_name]
        region = [ExcelExtractor._excel_coord_to_location(coord) for coord in region]
        r = region[0][0]
        # per row
        for row in sheet.region(region[0], region[1]):
            c = region[0][1]
            # per col
            for col in row:
                var = copy.deepcopy(variables)
                # per variable
                for k, v in var.items():
                    parsed_v = ExcelExtractor._parse_variable(v, r, c)
                    if len(parsed_v) == 1:  # normal variable
                        var[k] = parsed_v[0]
                    else:  # location
                        rr, cc = parsed_v
                        var[k] = sheet[rr, cc]
                    extractions.append(var)

                c += 1
            r += 1

        return extractions

    @staticmethod
    def _col_name_to_num(name: str) -> int:
        name = name.upper()
        pow = 1
        col_num = 0
        for letter in name[::-1]:
            col_num += (int(letter, 36) - 9) * pow
            pow *= 26
        return col_num - 1

    @staticmethod
    def _row_name_to_num(name: str) -> int:
        try:
            num = int(name) - 1
            if num >= 0:
                return num
            raise ValueError('Invalid row name')
        except:
            raise ValueError('Invalid row name')

    @staticmethod
    def _excel_coord_to_location(s: str) -> Tuple:
        ss = s.split(',')
        return ExcelExtractor._row_name_to_num(ss[1]), ExcelExtractor._col_name_to_num(ss[0])

    @staticmethod
    def _parse_variable(s: str, curr_row: int, curr_col: int) -> Tuple:
        '''
        $A,$2 <- constant col and row
        $row,$2 <- current col, row 2
        $A+1,$2 <- col A + 1 = 2, row 2
        $row+1,$2 <- current col + 1, row 2
        $A,$2-1 <-- col A, row 2 - 1 = 1
        '''

        def parse_expression(ss, curr_row, curr_col):
            ss = ss.replace('$row', str(curr_row))
            ss = ss.replace('$col', str(curr_col))
            ss = ExcelExtractor._re_row_identifier.sub(
                lambda x: str(ExcelExtractor._row_name_to_num(x.group()[1:])) if len(x.group()) > 0 else '', ss)
            ss = ExcelExtractor._re_col_identifier.sub(
                lambda x: str(ExcelExtractor._col_name_to_num(x.group()[1:])) if len(x.group()) > 0 else '', ss)
            return eval(ss)

        ss = s.split(',')
        if len(ss) == 1:
            return parse_expression(ss[0], curr_row, curr_col),
        elif len(ss) == 2:
            rr, cc = (ss[1], ss[0])
            return parse_expression(rr, curr_row, curr_col), parse_expression(cc, curr_row, curr_col)
        else:
            raise ValueError('Invalid variable')