Skip to content

mapper.matchers.prefix_matcher

Prefix matcher for statistics-to-geo assignments.

This class has been adapted to match the structure of the other matchers:
  • Direct setupUi(self) instead of wrapping in self.ui → consistent API.
  • All UI accesses go through self.<widget> (instead of self.ui.<widget>).
  • Pure formatting/name changes – logic and return values remain the same.

PrefixMatcher

Bases: BaseMatcher, Ui_PrefixMatcher

Matcher for prefix-based matching between statistics and geo data.

Behavior
  • Takes a prefix of configurable length from the selected columns.
  • Finds unique common prefixes in both tables.
  • Maps exactly one row per common prefix.
Source code in src/mapper/matchers/prefix_matcher.py
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
class PrefixMatcher(BaseMatcher, Ui_PrefixMatcher):
    """
    Matcher for prefix-based matching between statistics and geo data.

    Behavior:
      - Takes a prefix of configurable length from the selected columns.
      - Finds unique common prefixes in both tables.
      - Maps exactly one row (the first occurrence) per common prefix.
      - Rows whose selected value is missing (NaN/None) never take part in a match.
    """

    # --------------------------------------------------------------------------
    #  Constructor
    # --------------------------------------------------------------------------
    def __init__(self, nr: int, stats_cols: List[str], geo_cols: List[str], parent=None) -> None:
        """
        Initialize the prefix matcher widget.

        Args:
            nr: Matcher identifier, forwarded to the BaseMatcher constructor.
            stats_cols: Column names offered in the statistics (Excel) combo box.
            geo_cols: Column names offered in the geo combo box.
            parent: Optional parent object, forwarded to the BaseMatcher constructor.

        Steps:
          1. Call BaseMatcher constructor with identifier and column lists.
          2. Call setupUi(self) to build the UI controls.
          3. Populate the Excel and geo combo boxes with provided column lists.
          4. Configure the spin box `spinLength` to choose prefix length (range 1–100, initial 3).
          5. Connect UI signals to `updated` and `removed`.
        """
        super().__init__(nr, stats_cols, geo_cols, parent)

        # Build UI elements from the designer file
        self.setupUi(self)

        # Populate dropdowns with available columns
        self.comboExcel.addItems(stats_cols)
        self.comboGeo.addItems(geo_cols)

        # Configure prefix length spin box
        self.spinLength.setRange(1, 100)
        self.spinLength.setValue(3)

        # Connect UI changes only after the widgets are populated/configured,
        # so that the initial setup above is not forwarded to `updated`.
        self.comboExcel.currentIndexChanged.connect(self.updated)
        self.comboGeo.currentIndexChanged.connect(self.updated)
        self.spinLength.valueChanged.connect(self.updated)
        # Connect remove button to emit removal signal
        self.buttonRemove.clicked.connect(self.removed.emit)

    # --------------------------------------------------------------------------
    #  Matching logic
    # --------------------------------------------------------------------------
    def match(self, stats_df: pd.DataFrame, geo_df: pd.DataFrame) -> Tuple[Optional[pd.DataFrame], Optional[List[int]], Optional[List[int]]]:
        """
        Match records using exact prefix matching.

        Args:
            stats_df: Statistics table; must contain the column selected in `comboExcel`.
            geo_df: Geo table; must contain the column selected in `comboGeo`.

        Returns:
            A tuple `(mapped, ex_idx, ge_idx)` where `mapped` is the concatenation of the
            `build_result()` outputs, and `ex_idx` / `ge_idx` are the index labels of the
            matched statistics / geo rows. `(None, None, None)` if a selected column is
            missing or no prefix is shared.

        Steps:
          1. Retrieve selected column names and prefix length from UI.
          2. If the selected columns are not present, set stats to 0 and return.
          3. Compute prefixes by taking the first `length` characters of the string
             form of every non-missing value.
          4. Identify common prefixes between both DataFrames (in order of first
             appearance in the statistics table).
          5. For each common prefix, select the first statistics row and first geo row.
          6. Build combined result rows via `build_result()`.
          7. Concatenate parts into one DataFrame, update stats label, and return indices.
        """
        stats_col = self.comboExcel.currentText()
        geo_col = self.comboGeo.currentText()
        length = self.spinLength.value()

        # Safety check: ensure selected columns exist
        if stats_col not in stats_df.columns or geo_col not in geo_df.columns:
            self.set_stats(0)
            return None, None, None

        def first_position_by_prefix(column: pd.Series) -> dict:
            """Map each prefix to the row position of its first occurrence."""
            # Positional labels keep the lookup correct for any (even non-unique) index.
            values = column.reset_index(drop=True)
            # Drop missing values first: astype(str) would turn them into the text
            # "nan"/"None", and missing entries would then match each other.
            prefixes = values[values.notna()].astype(str).str[:length]
            firsts = prefixes[~prefixes.duplicated()]
            return dict(zip(firsts, firsts.index))

        stats_first = first_position_by_prefix(stats_df[stats_col])
        geo_first = first_position_by_prefix(geo_df[geo_col])

        # Common prefixes, kept in order of first appearance in the statistics table
        common_prefixes = [prefix for prefix in stats_first if prefix in geo_first]
        if not common_prefixes:
            self.set_stats(0)
            return None, None, None

        parts: list[pd.DataFrame] = []
        ex_idx: list[int] = []
        ge_idx: list[int] = []
        label = self.description()

        # Map exactly one row per prefix (single lookup instead of rescanning the frames)
        for prefix in common_prefixes:
            ex_row = stats_df.iloc[[stats_first[prefix]]]
            ge_row = geo_df.iloc[[geo_first[prefix]]]
            parts.append(self.build_result(ex_row, ge_row, label))
            ex_idx.append(ex_row.index[0])
            ge_idx.append(ge_row.index[0])

        # Concatenate matched parts into a single DataFrame
        mapped = pd.concat(parts, ignore_index=True)
        # Update stats label with count of matched rows
        self.set_stats(len(mapped))
        return mapped, ex_idx, ge_idx

    # --------------------------------------------------------------------------
    #  Update column lists when upstream data changes
    # --------------------------------------------------------------------------
    # NOTE(review): the two overrides below are disabled; confirm whether they
    # are still needed before re-enabling or deleting them.
    # def update_stats_columns(self, cols: List[str]) -> None:
    #     """
    #     Refresh the statistics column dropdown when column set changes.
    #
    #     Steps:
    #       1. Call parent method to update internal list.
    #       2. Remember currently selected column.
    #       3. Clear and repopulate the combo box with new columns.
    #       4. Reselect the previously chosen column if still available.
    #     """
    #     super().update_stats_columns(cols)
    #     cur = self.comboExcel.currentText()
    #     self.comboExcel.clear()
    #     self.comboExcel.addItems(cols)
    #     if cur in cols:
    #         self.comboExcel.setCurrentText(cur)

    # def update_geo_columns(self, cols: List[str]) -> None:
    #     """
    #     Refresh the geo column dropdown when column set changes.
    #
    #     Steps:
    #       1. Call parent method to update internal list.
    #       2. Remember currently selected column.
    #       3. Clear and repopulate the combo box with new columns.
    #       4. Reselect the previously chosen column if still available.
    #     """
    #     super().update_geo_columns(cols)
    #     cur = self.comboGeo.currentText()
    #     self.comboGeo.clear()
    #     self.comboGeo.addItems(cols)
    #     if cur in cols:
    #         self.comboGeo.setCurrentText(cur)

    # --------------------------------------------------------------------------
    #  Stats display helper
    # --------------------------------------------------------------------------
    def set_stats(self, n: int) -> None:
        """
        Update the label that displays number of matches found.

        Args:
            n: Number of matches; shown on `labelStats` as its string form.
        """
        self.labelStats.setText(str(n))

    # --------------------------------------------------------------------------
    #  Description provider
    # --------------------------------------------------------------------------
    def description(self) -> str:
        """
        Provide a description of this matcher's configuration.

        Returns:
            The matcher ID, selected stats column, selected geo column and prefix
            length, formatted as "PRE#<nr>:<stats>→<geo>[<length>]".
        """
        stats_name = self.comboExcel.currentText()
        geo_name = self.comboGeo.currentText()
        prefix_length = self.spinLength.value()
        return f"PRE#{self._nr}:{stats_name}→{geo_name}[{prefix_length}]"

__init__(nr, stats_cols, geo_cols, parent=None)

Initialize the prefix matcher widget.

Steps
  1. Call BaseMatcher constructor to store identifier and column lists.
  2. Call setupUi(self) to build UI controls from .ui file.
  3. Populate the Excel and geo combo boxes with provided column lists.
  4. Configure the spin box spinLength to choose prefix length (range 1–100).
  5. Connect UI signals to the updated and removed signals.
Source code in src/mapper/matchers/prefix_matcher.py
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def __init__(self, nr: int, stats_cols: List[str], geo_cols: List[str], parent=None) -> None:
    """
    Set up the prefix matcher widget.

    Args:
        nr: Matcher identifier, forwarded to the BaseMatcher constructor.
        stats_cols: Column names offered in the statistics (Excel) combo box.
        geo_cols: Column names offered in the geo combo box.
        parent: Optional parent object, forwarded to the BaseMatcher constructor.

    The BaseMatcher constructor runs first, then `setupUi(self)` builds the
    controls, the combo boxes and the prefix-length spin box are filled in,
    and only afterwards are the widget signals wired to `updated`/`removed`.
    """
    super().__init__(nr, stats_cols, geo_cols, parent)
    self.setupUi(self)

    # Fill each dropdown with its list of selectable columns.
    column_sources = (
        (self.comboExcel, stats_cols),
        (self.comboGeo, geo_cols),
    )
    for combo, columns in column_sources:
        combo.addItems(columns)

    # Prefix length: selectable from 1 to 100, starting at 3.
    self.spinLength.setRange(1, 100)
    self.spinLength.setValue(3)

    # Any change to the configuration widgets is forwarded to `updated`.
    change_signals = (
        self.comboExcel.currentIndexChanged,
        self.comboGeo.currentIndexChanged,
        self.spinLength.valueChanged,
    )
    for change_signal in change_signals:
        change_signal.connect(self.updated)

    # The remove button triggers the `removed` signal.
    self.buttonRemove.clicked.connect(self.removed.emit)

description()

Provide a description of this matcher's configuration.

Steps
  1. Combine the matcher ID, selected stats column, selected geo column, and prefix length into a string.
  2. Format as "PRE#<nr>:<stats>→<geo>[<length>]".
Source code in src/mapper/matchers/prefix_matcher.py
174
175
176
177
178
179
180
181
182
183
def description(self) -> str:
    """
    Describe this matcher's current configuration as a short label.

    Returns:
        The matcher ID, the selected statistics column, the selected geo
        column and the prefix length, formatted as
        "PRE#<nr>:<stats>→<geo>[<length>]".
    """
    stats_name = self.comboExcel.currentText()
    geo_name = self.comboGeo.currentText()
    prefix_length = self.spinLength.value()
    return f"PRE#{self._nr}:{stats_name}→{geo_name}[{prefix_length}]"

match(stats_df, geo_df)

Match records using exact prefix matching.

Steps
  1. Retrieve selected column names and prefix length from UI.
  2. If the selected columns are not present, set stats to 0 and return.
  3. Compute prefixes by taking the first length characters of string values.
  4. Identify common prefixes between both DataFrames.
  5. For each common prefix, select exactly one statistics row and one geo row.
  6. Build combined result rows via build_result().
  7. Concatenate parts into one DataFrame, update stats label, and return indices.
Source code in src/mapper/matchers/prefix_matcher.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
def match(self, stats_df: pd.DataFrame, geo_df: pd.DataFrame) -> Tuple[Optional[pd.DataFrame], Optional[List[int]], Optional[List[int]]]:
    """
    Match records using exact prefix matching.

    Args:
        stats_df: Statistics table; must contain the column selected in `comboExcel`.
        geo_df: Geo table; must contain the column selected in `comboGeo`.

    Returns:
        A tuple `(mapped, ex_idx, ge_idx)` where `mapped` is the concatenation of the
        `build_result()` outputs, and `ex_idx` / `ge_idx` are the index labels of the
        matched statistics / geo rows. `(None, None, None)` if a selected column is
        missing or no prefix is shared.

    Steps:
      1. Retrieve selected column names and prefix length from UI.
      2. If the selected columns are not present, set stats to 0 and return.
      3. Compute prefixes by taking the first `length` characters of the string
         form of every non-missing value.
      4. Identify common prefixes between both DataFrames (in order of first
         appearance in the statistics table).
      5. For each common prefix, select the first statistics row and first geo row.
      6. Build combined result rows via `build_result()`.
      7. Concatenate parts into one DataFrame, update stats label, and return indices.
    """
    stats_col = self.comboExcel.currentText()
    geo_col = self.comboGeo.currentText()
    length = self.spinLength.value()

    # Safety check: ensure selected columns exist
    if stats_col not in stats_df.columns or geo_col not in geo_df.columns:
        self.set_stats(0)
        return None, None, None

    def first_position_by_prefix(column: pd.Series) -> dict:
        """Map each prefix to the row position of its first occurrence."""
        # Positional labels keep the lookup correct for any (even non-unique) index.
        values = column.reset_index(drop=True)
        # Drop missing values first: astype(str) would turn them into the text
        # "nan"/"None", and missing entries would then match each other.
        prefixes = values[values.notna()].astype(str).str[:length]
        firsts = prefixes[~prefixes.duplicated()]
        return dict(zip(firsts, firsts.index))

    stats_first = first_position_by_prefix(stats_df[stats_col])
    geo_first = first_position_by_prefix(geo_df[geo_col])

    # Common prefixes, kept in order of first appearance in the statistics table
    common_prefixes = [prefix for prefix in stats_first if prefix in geo_first]
    if not common_prefixes:
        self.set_stats(0)
        return None, None, None

    parts: list[pd.DataFrame] = []
    ex_idx: list[int] = []
    ge_idx: list[int] = []
    label = self.description()

    # Map exactly one row per prefix (single lookup instead of rescanning the frames)
    for prefix in common_prefixes:
        ex_row = stats_df.iloc[[stats_first[prefix]]]
        ge_row = geo_df.iloc[[geo_first[prefix]]]
        parts.append(self.build_result(ex_row, ge_row, label))
        ex_idx.append(ex_row.index[0])
        ge_idx.append(ge_row.index[0])

    # Concatenate matched parts into a single DataFrame
    mapped = pd.concat(parts, ignore_index=True)
    # Update stats label with count of matched rows
    self.set_stats(len(mapped))
    return mapped, ex_idx, ge_idx

set_stats(n)

Update the label that displays number of matches found.

Steps
  1. Convert integer n to string and set it on labelStats.
Source code in src/mapper/matchers/prefix_matcher.py
162
163
164
165
166
167
168
169
def set_stats(self, n: int) -> None:
    """
    Show the number of matches found on the stats label.

    Args:
        n: Match count; its string form becomes the text of `labelStats`.
    """
    count_text = str(n)
    self.labelStats.setText(count_text)