# Source code for angr.analyses.decompiler.optimization_passes.duplication_reverter.duplication_reverter

from __future__ import annotations
from collections import defaultdict
import logging
from itertools import combinations
import itertools

import networkx as nx

import ailment
from ailment.block import Block
from ailment.statement import ConditionalJump, Jump, Assignment, Return, Label
from ailment.expression import Const, Register, Convert, Expression

from .ail_merge_graph import AILMergeGraph, AILBlockSplit
from .errors import SAILRSemanticError
from .similarity import longest_ail_graph_subseq

from .utils import (
    replace_node_in_graph,
    find_block_in_successors_by_addr,
    copy_graph_and_nodes,
    correct_jump_targets,
    deepcopy_ail_anyjump,
)
from angr.analyses.decompiler.optimization_passes.optimization_pass import StructuringOptimizationPass
from angr.analyses.decompiler.block_io_finder import BlockIOFinder
from angr.analyses.decompiler.block_similarity import is_similar, index_of_similar_stmts, longest_ail_subseq
from angr.analyses.decompiler.utils import to_ail_supergraph, remove_labels
from angr.analyses.decompiler.counters.boolean_counter import BooleanCounter
from angr.knowledge_plugins.key_definitions.atoms import MemoryLocation
from angr.utils.graph import dominates

_l = logging.getLogger(name=__name__)


class DuplicationReverter(StructuringOptimizationPass):
    """
    This (de)optimization reverts the effects of many compiler optimizations that cause code duplication in
    the decompilation. This deoptimization is the implementation of the USENIX 2024 paper SAILR's ISD
    doptimization. As such, the main goal of this optimization is to remove code duplication by merging
    semantically similar blocks in the AIL graph.
    """

    # Human-readable name of this optimization pass.
    NAME = "Revert Statement Duplication Optimizations"
    # The description is the class docstring with surrounding whitespace stripped, so any edit to the
    # docstring above also changes DESCRIPTION.
    DESCRIPTION = __doc__.strip()
def __init__(self, func, max_guarding_conditions=4, **kwargs):
    """
    Configure the structuring-pass base class, initialise this pass's own state, and run the analysis.

    :param func:                     Forwarded unchanged as the first argument of the base-class constructor.
    :param max_guarding_conditions:  Threshold on the number of boolean operators allowed in a guarding
                                     condition; stored on the instance as ``max_guarding_conditions``.
    :param kwargs:                   Extra keyword arguments forwarded to the base-class constructor.
    """
    super().__init__(
        func,
        prevent_new_gotos=True,
        strictly_less_gotos=False,
        recover_structure_fails=True,
        must_improve_rel_quality=True,
        max_opt_iters=5,
        simplify_ail=True,
        require_gotos=True,
        readd_labels=True,
        **kwargs,
    )

    # lookup caches
    self._entry_node_cache = {}
    self._idom_cache = {}

    # candidates that must not be attempted again
    self.candidate_blacklist = set()

    # working graph (mutated during a merge) and its pristine counterpart; both are populated by _analyze
    self.read_graph: nx.DiGraph | None = None
    self.write_graph: nx.DiGraph | None = None

    self.max_guarding_conditions = max_guarding_conditions

    # the analysis is kicked off directly from the constructor
    self.analyze()
#
# Superclass methods
#

def _check(self):
    """
    Report that this pass is always applicable.

    :return: ``(True, {})`` -- the pass should run, with an empty cache.
    """
    return True, {}

def _get_new_gotos(self):
    """
    Return the gotos known to the goto manager, excluding those reported by
    ``_find_future_irreducible_gotos``.
    """
    future_irreducible_gotos = self._find_future_irreducible_gotos()
    return [goto for goto in self._goto_manager.gotos if goto not in future_irreducible_gotos]

#
# Main Analysis
#

def _analyze(self, cache=None) -> bool:
    """
    This function is the main analysis function for this deoptimization which implements SAILR's ISD
    deoptimization. There are generally three steps to this deoptimization:
    1. Search for candidates to merge based on the ISD-schema
    2. Construct the middle graph/node that is merged from the duplicate candidate
    3. Reinsert the merged candidate into the original graph

    Of these stages, the later two are the most complex. In stage 2, we create a new AILMergeGraph that
    represents the merging of two subgraphs that are duplicates. This stage will also record how blocks map
    to the split forms (see AILMergeGraph class string for more information). During this stage, semantic
    failures can happen, which mean that while creating the merged graph we encounter a scenario that is
    non-verifiable to not harm the graph. In these cases, we bail.

    In stage 3, we reinsert the merged candidate into the original graph. This stage is also a little messy
    because we need to correct every jump address.

    :param cache: Unused by this implementation.
    :return:      False only when no candidate could be found. True in every other case, including when
                  the selected candidate was blacklisted and the graph was left unchanged (another
                  iteration may then pick a different candidate).
    """
    # construct graphs for writing and reading so we can corrupt the write graph
    # but still have a clean copy to read from
    graph = self.out_graph or self._graph
    self.write_graph = remove_labels(to_ail_supergraph(copy_graph_and_nodes(graph), allow_fake=True))
    self.read_graph: nx.DiGraph = self.write_graph.copy()

    # phase 1: search for candidates to merge based on the ISD-schema
    candidate = self._search_for_deduplication_candidate()
    if candidate is None:
        return False

    # phase 2: construct the middle graph/node that is merged from the duplicate candidate
    try:
        ail_merge_graph, candidate = self._construct_merged_candidate(candidate)
    except SAILRSemanticError as e:
        _l.debug("Skipping this candidate because of %s...", e)
        # the tuple assignment above never happened, so `candidate` is still the phase-1 pair
        self.candidate_blacklist.add(tuple(candidate))
        return True

    # phase 3: reinsert the merged candidate into the original graph
    success = self._reinsert_merged_candidate(ail_merge_graph, candidate)
    if not success:
        # NOTE(review): `candidate` is now the merge graph's `starts`, not the phase-1 pair -- confirm this
        # is the key the blacklist is meant to hold.
        self.candidate_blacklist.add(tuple(candidate))
        return True

    self.out_graph = to_ail_supergraph(self.write_graph)
    return True

def _search_for_deduplication_candidate(self) -> tuple[Block, Block] | None:
    """
    Pick one pair of blocks to attempt a merge on.

    The initial candidates are filtered, ordered by length, and the first one is selected; its members are
    returned ordered by address.

    :return: The first two blocks of the chosen candidate, or None when no candidate survives.
    """
    candidates = self._find_initial_candidates()
    if not candidates:
        _l.debug("There are no duplicate statements in this function, stopping analysis")
        return None

    # with merge_candidates=False, max size for a candidate is 2
    candidates = self._filter_candidates(candidates, merge_candidates=False)
    if not candidates:
        _l.debug("There are no duplicate blocks in this function, stopping analysis")
        return None

    candidates = sorted(candidates, key=len)
    _l.debug("Located %d candidates for merging: %s", len(candidates), candidates)

    candidate = sorted(candidates[0], key=lambda x: x.addr)
    _l.debug("Selecting the candidate: %s", candidate)
    return candidate[0], candidate[1]

def _construct_merged_candidate(
    self, candidate: tuple[Block, Block]
) -> tuple[AILMergeGraph, tuple[Block, Block]] | None:
    """
    Build the merged graph for ``candidate`` on top of the write graph.

    :param candidate: The pair of blocks selected for merging.
    :return:          The merge graph together with its ``starts``, which replace the candidate.
    :raises SAILRSemanticError: If the merge graph cannot be created, or if none of the merge graph's
                                original end blocks has a goto edge.
    """
    ail_merge_graph = self.create_merged_subgraph(candidate, self.write_graph)
    new_candidate = ail_merge_graph.starts
    # at least one of the original ends must be involved in a goto, otherwise the merge is pointless
    for block in ail_merge_graph.original_ends:
        if self._block_has_goto_edge(
            block, [b for b in ail_merge_graph.original_ends if b is not block], graph=self.write_graph
        ):
            break
    else:
        raise SAILRSemanticError("An initial candidate was incorrectly reported to have gotos at it's ends!")

    return ail_merge_graph, new_candidate

def _reinsert_merged_candidate(self, ail_merge_graph: AILMergeGraph, candidate: tuple[Block, Block]) -> bool:
    """
    Replace the original duplicated blocks in the write graph with the merged graph and repair all edges
    and jump targets going into and out of it.

    On any failure the write graph is reset to a copy of the read graph.

    :param ail_merge_graph: The merge graph produced for ``candidate``.
    :param candidate:       The start blocks of the merge (keys of ``ail_merge_graph.original_blocks``).
    :return:                True if the merged graph was inserted, False if the insertion was abandoned.
    """
    og_succs, og_preds = {}, {}
    for original_blocks in ail_merge_graph.original_blocks.values():
        # collect all the old edges
        for og_block in original_blocks:
            og_succs[og_block] = list(self.write_graph.successors(og_block))
            og_preds[og_block] = list(self.write_graph.predecessors(og_block))

        # delete all the blocks that will be merged into the merge_graph
        self.write_graph.remove_nodes_from(original_blocks)

    # add the new graph in to the original graph
    self.write_graph = nx.compose(self.write_graph, ail_merge_graph.graph)

    # connect all the out-edges that may have been altered
    for merged_node, originals in ail_merge_graph.merge_blocks_to_originals.items():
        last_stmt = merged_node.statements[-1]
        curr_succs = list(self.write_graph.successors(merged_node))
        # skip any nodes that already have enough successors
        broken_conditional_jump = not isinstance(last_stmt, (ConditionalJump, Jump)) and len(curr_succs) == 1
        if (
            broken_conditional_jump
            or (isinstance(last_stmt, Jump) and len(curr_succs) == 1)
            or (isinstance(last_stmt, ConditionalJump) and len(curr_succs) == 2)
        ):
            continue

        all_og_succs = set()
        for orig in originals:
            orig_block = orig.original if isinstance(orig, AILBlockSplit) else orig
            if orig_block not in og_succs:
                continue

            for og_suc in og_succs[orig_block]:
                # successors that were themselves merged away no longer exist in the write graph
                if og_suc not in self.write_graph:
                    continue

                all_og_succs.add(og_suc)

        # no if-stmt updating is needed here!
        for og_succ in all_og_succs:
            self.write_graph.add_edge(merged_node, og_succ)

    # correct all the in-edges that may have been altered
    all_preds = set()
    for block in candidate:
        for original in ail_merge_graph.original_blocks[block]:
            if original not in og_preds:
                continue

            orig_preds = og_preds[original]
            for orig_pred in orig_preds:
                if orig_pred not in self.write_graph:
                    continue

                all_preds.add(orig_pred)

    for orig_pred in all_preds:
        last_stmt = orig_pred.statements[-1]
        if isinstance(last_stmt, (Jump, ConditionalJump)):
            target_addrs = []
            if isinstance(last_stmt, Jump):
                if not isinstance(last_stmt.target, Const):
                    _l.debug("Candidate %s is a child of an indirect-jump, which is not supported", candidate)
                    self.write_graph = self.read_graph.copy()
                    return False

                # the target is known to be a Const here because of the early return above
                target_addrs = [last_stmt.target.value]
            elif isinstance(last_stmt, ConditionalJump):
                target_addrs = [last_stmt.true_target.value, last_stmt.false_target.value]

            replacement_map = {}
            for target_addr in target_addrs:
                # every merged block that descends from an original block at the targeted address
                target_candidates = []
                for mblock, oblocks in ail_merge_graph.merge_blocks_to_originals.items():
                    for oblock in oblocks:
                        if (isinstance(oblock, AILBlockSplit) and oblock.original.addr == target_addr) or (
                            isinstance(oblock, Block) and oblock.addr == target_addr
                        ):
                            target_candidates.append(mblock)

                if not target_candidates:
                    continue

                new_target = None
                curr_succs = list(self.write_graph.successors(orig_pred))
                target_candidates = [t for t in target_candidates if t not in curr_succs]
                # preference 1: a merged block that kept the targeted address
                for target_can in target_candidates:
                    if target_can.addr == target_addr:
                        new_target = target_can
                        break

                # preference 2: a merged block that maps to a whole (unsplit) original block
                if new_target is None:
                    for target_can in target_candidates:
                        found = False
                        for orig in ail_merge_graph.merge_blocks_to_originals[target_can]:
                            if isinstance(orig, Block):
                                new_target = target_can
                                found = True
                                break

                        if found:
                            break

                # preference 3: a split block, tried in the order up, match, down
                if new_target is None:
                    for split_type in ["up_split", "match_split", "down_split"]:
                        found = False
                        for target_can in target_candidates:
                            if ail_merge_graph.merged_is_split_type(target_can, split_type):
                                new_target = target_can
                                found = True
                                break

                        if found:
                            break

                if new_target is None:
                    _l.debug("Unable to correct a predecessor, this is a bug!")
                    self.write_graph = self.read_graph.copy()
                    return False

                replacement_map[target_addr] = new_target.addr
                self.write_graph.add_edge(orig_pred, new_target)

            new_pred = orig_pred.copy()
            new_pred.statements[-1] = correct_jump_targets(new_pred.statements[-1], replacement_map, new_stmt=True)
            if new_pred != orig_pred:
                replace_node_in_graph(self.write_graph, orig_pred, new_pred)
        else:
            # we are at a block that has no ending, if this block does not end in one successor, then
            # it is just an incorrect graph
            orig_pred_succs = list(self.read_graph.successors(orig_pred))
            assert len(orig_pred_succs) == 1
            orig_pred_succ = orig_pred_succs[0]
            new_succ = None
            for merge, originals in ail_merge_graph.merge_blocks_to_originals.items():
                found = False
                for og in originals:
                    if (og == orig_pred_succ) or (isinstance(og, AILBlockSplit) and og.original == orig_pred_succ):
                        new_succ = merge
                        found = True
                        break

                if found:
                    break

            if new_succ is None:
                _l.debug("Unable to find the successor for block with no jump or condition!")
                self.write_graph = self.read_graph.copy()
                return False

            self.write_graph.add_edge(orig_pred, new_succ)

    self.write_graph = self._correct_all_broken_jumps(self.write_graph)
    self.write_graph = self._uniquify_addrs(self.write_graph)

    _l.info("Candidate merge successful on blocks: %s", candidate)
    return True

#
# Helpers
#

def _uniquify_addrs(self, graph):
    """
    Return a new graph in which blocks that shared an address each receive a fresh address, every block's
    ``idx`` is cleared, and jump targets are rewritten to follow the renamed destinations.

    :param graph: The graph to rebuild; its node and edge order is preserved in the result.
    :return:      A new ``nx.DiGraph`` holding the (possibly copied) blocks.
    """
    new_graph = nx.DiGraph()
    new_nodes = {}
    nodes_by_addr = defaultdict(list)
    for node in graph.nodes:
        nodes_by_addr[node.addr].append(node)

    for nodes in nodes_by_addr.values():
        if len(nodes) == 1:
            continue

        # we have multiple nodes with the same address
        duplicate_addr_nodes = sorted(nodes, key=lambda x: (x.idx or -1), reverse=True)
        for duplicate_node in duplicate_addr_nodes:
            new_node = duplicate_node.copy()
            new_node.idx = None
            new_addr = self.new_block_addr()
            new_node.addr = new_addr
            # re-derive instruction addresses from the new block address and statement position
            for i, stmt in enumerate(new_node.statements):
                if stmt.tags and "ins_addr" in stmt.tags:
                    stmt.tags["ins_addr"] = new_addr + i + 1

            new_nodes[duplicate_node] = new_node

    # reset the idx for all of them since they are unique now, also fix the jump targets idx
    for node in graph.nodes:
        new_node = new_nodes[node] if node in new_nodes else node.copy()
        new_node.idx = None
        if new_node.statements and isinstance(new_node.statements[-1], Jump):
            new_node.statements[-1].target_idx = None

        new_nodes[node] = new_node

    # fixup every single jump target (before adding them to the graph)
    for src, dst in graph.edges():
        new_src = new_nodes[src]
        new_dst = new_nodes[dst]
        if new_dst is not dst:
            new_new_src = new_src.copy()
            new_new_src.statements[-1] = correct_jump_targets(new_new_src.statements[-1], {dst.addr: new_dst.addr})
            new_nodes[src] = new_new_src

    # add all the nodes in the same order back to the graph
    for node in graph.nodes:
        new_graph.add_node(new_nodes[node])

    for src, dst, data in graph.edges(data=True):
        new_graph.add_edge(new_nodes[src], new_nodes[dst], **data)

    return new_graph

def _correct_all_broken_jumps(self, graph):
    """
    Return a new graph whose blocks end in jumps that agree with the graph's edges.

    Blocks with one successor get their trailing ``Jump`` retargeted (or a ``Jump`` appended when the last
    statement is not one). Blocks with two successors ending in a ``ConditionalJump`` are repaired only
    when exactly one of the two targets disagrees with the successors.

    :param graph: The graph to rebuild.
    :return:      A new ``nx.DiGraph``; unmodified blocks are reused, modified ones are copies.
    """
    new_graph = nx.DiGraph()
    new_nodes = {}
    for node in graph.nodes:
        # correct the last statement of the node for single-successor nodes
        new_node = node
        if graph.out_degree(node) == 1:
            last_stmt = node.statements[-1]
            successor = next(iter(graph.successors(node)))
            if isinstance(last_stmt, Jump):
                if last_stmt.target.value != successor.addr:
                    new_last_stmt = deepcopy_ail_anyjump(last_stmt, idx=last_stmt.idx)
                    # Update the replacement statement, not `last_stmt`: the latter is still referenced by
                    # the node of the input graph and must not be mutated here.
                    new_last_stmt.target_idx = successor.idx
                    new_last_stmt.target = Const(None, None, successor.addr, self.project.arch.bits)
                    new_node = node.copy()
                    new_node.statements[-1] = new_last_stmt
            # the last statement is not a jump, but this node should have one, so add it
            else:
                new_node = node.copy()
                new_last_stmt = Jump(
                    None, Const(None, None, successor.addr, self.project.arch.bits), target_idx=successor.idx
                )
                # TODO: improve addressing here
                new_last_stmt.tags["ins_addr"] = new_node.addr + 1
                new_node.statements.append(new_last_stmt)
        elif graph.out_degree(node) == 2:
            last_stmt = node.statements[-1]
            if isinstance(last_stmt, ConditionalJump):
                real_successor_addrs = [_n.addr for _n in graph.successors(node)]
                addr_map = {}
                unmapped_addrs = []
                # pair off every jump target with a real successor; whatever is left over is broken
                for target in (last_stmt.true_target, last_stmt.false_target):
                    if target.value in real_successor_addrs:
                        addr_map[target.value] = target.value
                        real_successor_addrs.remove(target.value)
                    else:
                        unmapped_addrs.append(target.value)

                # right now we can only correct cases where one edge is incorrect
                if len(real_successor_addrs) == 1 and len(unmapped_addrs) == 1:
                    addr_map[unmapped_addrs[0]] = real_successor_addrs[0]
                    new_last_stmt = correct_jump_targets(last_stmt, addr_map, new_stmt=True)
                    new_node = node.copy()
                    new_node.statements[-1] = new_last_stmt

        new_nodes[node] = new_node
        new_graph.add_node(new_node)

    for src, dst, data in graph.edges(data=True):
        new_graph.add_edge(new_nodes[src], new_nodes[dst], **data)

    return new_graph

def _construct_best_condition_block_for_merge(self, blocks, graph) -> tuple[Block, Block]:
    """
    Create the conditional block that will guard the divergence of the merged graph.

    The condition with the fewest boolean operators (as counted by ``boolean_operators_in_condition``) is
    chosen among those collected between the shared conditional dominator and ``blocks``.

    :param blocks: The blocks being merged.
    :param graph:  The graph the blocks live in.
    :return:       ``(cond_block, true_block)`` -- the new single-statement conditional block, whose jump
                   targets are both placeholder zeros, and the start block its condition belongs to.
    :raises SAILRSemanticError: If no condition is available, or the best condition has at least
                                ``max_guarding_conditions`` boolean operators.
    """
    # find the conditions that block both of these blocks
    common_cond = self.shared_common_conditional_dom(blocks, graph)
    conditions_by_start = self.collect_conditions_between_nodes(graph, common_cond, blocks)
    best_condition_pair = None
    for start, condition in conditions_by_start.items():
        if best_condition_pair is None:
            best_condition_pair = (start, condition)
            continue

        if isinstance(condition, Const):
            continue

        _, best_cond = best_condition_pair
        if self.boolean_operators_in_condition(condition) < self.boolean_operators_in_condition(best_cond):
            best_condition_pair = start, condition

    if best_condition_pair is None:
        # nothing was collected; bail out through the error type that _analyze already handles instead of
        # failing on the tuple unpacking below
        self.candidate_blacklist.add(tuple(blocks))
        raise SAILRSemanticError(
            "No guarding condition could be found for the candidate blocks, this analysis must skip it"
        )

    true_block, best_condition = best_condition_pair
    boolean_cnt = self.boolean_operators_in_condition(best_condition)
    if boolean_cnt >= self.max_guarding_conditions:
        self.candidate_blacklist.add(tuple(blocks))
        raise SAILRSemanticError("A condition would be too long for a fixup, this analysis must skip it")

    cond_block = Block(common_cond.addr, 1, idx=common_cond.idx + 1 if isinstance(common_cond.idx, int) else 1)
    old_stmt_tags = common_cond.statements[0].tags
    cond_jump = ConditionalJump(
        1,
        best_condition.copy() if best_condition is not None else None,
        Const(None, None, 0, self.project.arch.bits),
        Const(None, None, 0, self.project.arch.bits),
        **old_stmt_tags,
    )
    cond_block.statements = [cond_jump]
    return cond_block, true_block
@staticmethod
def boolean_operators_in_condition(condition: Expression):
    """
    Walk ``condition`` with a BooleanCounter and return the resulting ``boolean_cnt``.

    TODO: this entire boolean checking semantic we use needs to be removed, see how it is used for other
    dels needed. We need to replace it with a boolean variable insertion on both branches that lead to the
    new block. Say we have:

        if (A()) {
            do_thing();
        }
        if (B()) {
            do_thing();
        }

    We want to translate it to:

        int should_do_thing = 0;
        if (A())
            should_do_thing = 1;
        if (B())
            should_do_thing = 1;

        if (should_do_thing)
            do_thing();

    Although longer, this code can be optimized to look like:

        int should_do_thing = A() || B();
        if (should_do_thing)
            do_thing();
    """
    counter = BooleanCounter()
    counter.walk_expression(condition)
    return counter.boolean_cnt
@staticmethod
def _input_defined_by_other_stmt(target_idx, other_idx, io_finder):
    """
    Tell whether the statement at ``target_idx`` reads something the statement at ``other_idx`` writes.

    :return: True when any input of the target is a non-stack memory location (such a statement is treated
             as not movable); otherwise the intersection of the target's inputs with the other statement's
             outputs, which is truthy exactly when there is a dependency.
    """
    consumed = io_finder.inputs_by_stmt[target_idx]
    for atom in consumed:
        # any memory location, not on stack, is not movable
        if isinstance(atom, MemoryLocation) and not atom.is_on_stack:
            return True

    return consumed.intersection(io_finder.outputs_by_stmt[other_idx])

@staticmethod
def _output_used_by_other_stmt(target_idx, other_idx, io_finder):
    """
    Tell whether the statement at ``target_idx`` writes something the statement at ``other_idx`` reads.

    :return: True when any output of the target is a non-stack memory location (such a statement is
             treated as not movable); otherwise the intersection of the target's outputs with the other
             statement's inputs, which is truthy exactly when there is a dependency.
    """
    produced = io_finder.outputs_by_stmt[target_idx]
    for atom in produced:
        # any memory location, not on stack, is not movable
        if isinstance(atom, MemoryLocation) and not atom.is_on_stack:
            return True

    return produced.intersection(io_finder.inputs_by_stmt[other_idx])
def stmt_can_move_to(self, stmt, block, new_idx, io_finder=None):
    """
    Decide whether ``stmt`` may be relocated to position ``new_idx`` inside ``block``.

    :param stmt:      A statement that must already be in ``block.statements``.
    :param block:     The block that contains the statement.
    :param new_idx:   The index the statement would be moved to.
    :param io_finder: Optional pre-computed BlockIOFinder for ``block``; one is created when omitted.
    :return:          True if the move crosses no dependency, False otherwise.
    :raises NotImplementedError: If ``stmt`` is not part of ``block``.
    """
    statements = block.statements
    if stmt not in statements:
        raise NotImplementedError("Statement not in block, and we can't compute moving a stmt to a new block!")

    # jumps of any kind are not moveable, and nothing may take the place of a block-ending jump
    jump_types = (ConditionalJump, Jump)
    if isinstance(stmt, jump_types):
        return False
    if new_idx == len(statements) - 1 and isinstance(statements[new_idx], jump_types):
        return False

    finder = io_finder or BlockIOFinder(block, self.project)
    src_idx = statements.index(stmt)

    if new_idx < src_idx:
        # moving up: nothing in [new_idx, src_idx) may define what the statement reads
        blocked = any(
            self._input_defined_by_other_stmt(src_idx, passed_idx, finder)
            for passed_idx in range(new_idx, src_idx)
        )
    else:
        # moving down: nothing in (src_idx, new_idx] may use what the statement writes
        blocked = any(
            self._output_used_by_other_stmt(src_idx, passed_idx, finder)
            for passed_idx in range(src_idx + 1, new_idx + 1)
        )

    return not blocked
def maximize_similarity_of_blocks(self, block1, block2, graph) -> tuple[Block, Block]:
    """
    This attempts to rearrange the order of statements in block1 and block2 to maximize the similarity
    between them. This implementation is a little outdated since CodeMotion optimization was implemented,
    but it should be disabled until we have a good SSA implementation.

    The work is done on copies of the two blocks. When the statements of a copy end up different from its
    original, the original is replaced by the copy in ``graph``.

    TODO: reimplement me when we have better SSA

    :param block1: The first block.
    :param block2: The second block.
    :param graph:  The graph containing both blocks; updated in place when a block changes.
    :return:       The two reordered copies if at least one block changed, otherwise the original blocks.
    """
    new_block1, new_block2 = block1.copy(), block2.copy()
    updates = True
    # statements that were already moved (or matched against a moved one) are never touched again
    prev_moved = set()
    while updates:
        updates = False
        _, lcs_idxs = longest_ail_subseq([new_block1.statements, new_block2.statements])
        lcs_idx_by_block = {new_block1: lcs_idxs[0], new_block2: lcs_idxs[1]}
        # stop when either block has no index for the common subsequence
        if any(v is None for v in lcs_idx_by_block.values()):
            break

        io_finder_by_block = {
            new_block1: BlockIOFinder(new_block1, self.project),
            new_block2: BlockIOFinder(new_block2, self.project),
        }

        # look at the statement one position before (-1) and one position after (+1) the subsequence index
        for search_offset in (-1, 1):
            # try both orderings: (block1, block2) and (block2, block1)
            for b1, b2 in itertools.permutations([new_block1, new_block2], 2):
                if lcs_idx_by_block[b1] + search_offset < 0 or lcs_idx_by_block[b1] + search_offset >= len(
                    b1.statements
                ):
                    continue

                b1_unmatched = b1.statements[lcs_idx_by_block[b1] + search_offset]
                if b1_unmatched in prev_moved:
                    continue

                unmatched_b2_positions = index_of_similar_stmts([b1_unmatched], b2.statements, all_positions=True)
                if unmatched_b2_positions is None:
                    continue

                # b1_unmatched must be in b2
                for b2_pos in unmatched_b2_positions:
                    b2_stmt = b2.statements[b2_pos]
                    if b2_stmt in prev_moved:
                        continue

                    if b2_pos + search_offset < 0 or b2_pos + search_offset >= len(b2.statements):
                        continue

                    # a stmt must be independent to be moveable
                    if self.stmt_can_move_to(
                        b2_stmt, b2, lcs_idx_by_block[b2] + search_offset, io_finder=io_finder_by_block[b2]
                    ):
                        # prev_stmts = b2.statements.copy()
                        b2.statements.remove(b2_stmt)
                        b2.statements.insert(lcs_idx_by_block[b2] + search_offset, b2_stmt)
                        prev_moved.add(b2_stmt)
                        prev_moved.add(b1_unmatched)
                        # new_lcs, _ = longest_ail_subseq([b1.statements, b2.statements])
                        ## if changes make don't make the lcs longer, revert changes
                        # if len(lcs) >= len(new_lcs):
                        #    b2.statements = prev_stmts
                        updates = True
                        break

                # a statement was moved: unwind all loops so the subsequence is recomputed
                if updates:
                    break

            if updates:
                break
        else:
            # no updates happen, we are ready to kill this search
            break

    graph_changed = False
    if new_block1.statements != block1.statements:
        replace_node_in_graph(graph, block1, new_block1)
        graph_changed = True
    if new_block2.statements != block2.statements:
        replace_node_in_graph(graph, block2, new_block2)
        graph_changed = True

    if graph_changed:
        return new_block1, new_block2

    return block1, block2
def create_merged_subgraph(self, blocks, graph: nx.DiGraph, maximize_similarity=False) -> AILMergeGraph:
    """
    Build the AILMergeGraph that merges the two subgraphs starting at ``blocks``.

    :param blocks:              The pair of start blocks to merge.
    :param graph:               The graph both blocks live in.
    :param maximize_similarity: When True, first reorder statements of the start blocks with
                                ``maximize_similarity_of_blocks``.
    :return:                    The merge graph, with the guarding conditional block attached.
    :raises SAILRSemanticError: When the blocks consist only of returns/labels, when the common
                                subsequence search fails, or when no condition-less graph can be created.
                                In each of these cases the blocks are blacklisted first.
    """
    # Before creating a full graph LCS, optimize the common seq between the starting blocks
    if not maximize_similarity:
        blocks = list(blocks)
    else:
        # TODO: this is disabled by default right now because it's both slow and incorrect. It should
        #   be fixed one day when we have a good SSA implementation. To test this, use the following:
        #   https://github.com/mahaloz/sailr-eval/blob/d9f99b3521b60b9a1fd862d106b77e5664a9d175
        #   /tests/test_deoptimization.py#L130
        blocks = list(self.maximize_similarity_of_blocks(blocks[0], blocks[1], graph))

    # Eliminate all cases that may only have returns (we should do that in a later pass)
    only_returns = all(isinstance(stmt, (Return, Label)) for blk in blocks for stmt in blk.statements)
    if only_returns:
        self.candidate_blacklist.add(tuple(blocks))
        raise SAILRSemanticError("Both blocks only contain returns, this analysis must skip it")

    # Traverse both blocks subgraphs within the original graph and find the longest common AIL sequence.
    # Use one of the blocks subgraphs to construct the top-half of the new merged graph that contains no
    # inserted conditions yet. This means the graph is still missing the divergence of the two graphs.
    try:
        graph_lcs = longest_ail_graph_subseq(blocks, graph)
    except SAILRSemanticError as err:
        self.candidate_blacklist.add(tuple(blocks))
        raise err

    merge_graph = AILMergeGraph(original_graph=graph)
    # some blocks in originals may update during this time (if-statements can change)
    update_blocks = merge_graph.create_conditionless_graph(blocks, graph_lcs)
    if update_blocks is None:
        # failed to create the condition-less graph
        self.candidate_blacklist.add(tuple(blocks))
        raise SAILRSemanticError("Failed to create a condition-less graph, this analysis must skip it")

    #
    # SPECIAL CASE: the merged graph contains only 1 node and no splits
    # allows for an early return without expensive computations
    #
    if len(merge_graph.graph.nodes) == 1 and not any(merge_graph.original_split_blocks.values()):
        only_node = next(iter(merge_graph.graph.nodes))
        base_successor = next(iter(graph.successors(blocks[0])))
        other_successor = next(iter(graph.successors(blocks[1])))
        conditional_block, true_target = self._construct_best_condition_block_for_merge(blocks, graph)

        # the true branch leads to the successor of whichever block owns the chosen condition
        if true_target == blocks[0]:
            true_succ, false_succ = base_successor, other_successor
        else:
            true_succ, false_succ = other_successor, base_successor
        cond_jump = conditional_block.statements[-1]
        cond_jump.true_target.value = true_succ.addr
        cond_jump.false_target.value = false_succ.addr

        merge_graph.graph.add_edge(only_node, conditional_block)
        return merge_graph

    # we have now generated the top half of the merge graph. We now need to create a mapping for all the
    # merge_graph blocks to the original blocks from the two targets we are merging. This map will store
    # the AILBlockSplit if it is a split, so we can track preds and succs later.
    merge_end_pairs = merge_graph.create_mapping_to_merge_graph(update_blocks, blocks)

    # collect the conditions and make a new conditional block
    conditional_block, true_target = self._construct_best_condition_block_for_merge(blocks, graph)
    if true_target is blocks[0]:
        true_target = merge_graph.starts[0]
    else:
        true_target = merge_graph.starts[1]
    merge_graph.add_edges_to_condition(conditional_block, true_target, merge_end_pairs)

    return merge_graph
def similar_conditional_when_single_corrected(self, block1: Block, block2: Block, graph: nx.DiGraph):
    """
    Check whether two blocks end in matching conditional jumps whose targets differ in exactly one branch,
    and if so balance the graph by inserting a NOP block on each mismatched edge.

    NOTE: on success this mutates ``graph`` (edges are added and removed) and rewrites the mismatched jump
    target of both blocks' last statement in place.

    :param block1: The first block; its last statement must be a ConditionalJump.
    :param block2: The second block; its last statement must be a ConditionalJump.
    :param graph:  The graph containing both blocks.
    :return:       True if the blocks qualified and the graph was balanced, False otherwise (in which case
                   the graph is left untouched).
    """
    cond1, cond2 = block1.statements[-1], block2.statements[-1]
    if not isinstance(cond1, ConditionalJump) or not isinstance(cond2, ConditionalJump):
        return False

    # conditions must match
    if not cond1.condition.likes(cond2.condition):
        return False

    # collect the true and false targets for the condition
    block_to_target_map = defaultdict(dict)
    for block, cond in ((block1, cond1), (block2, cond2)):
        for succ in graph.successors(block):
            if succ.addr == cond.true_target.value:
                block_to_target_map[block]["true_target"] = succ
            elif succ.addr == cond.false_target.value:
                block_to_target_map[block]["false_target"] = succ
            else:
                # exit early if you ever can't find a supposed target
                return False

    # check if at least one block in successors match
    targets1 = block_to_target_map[block1]
    targets2 = block_to_target_map[block2]
    mismatched_blocks = {}
    for target_type, t1_blk in targets1.items():
        t2_blk = targets2.get(target_type)
        if t2_blk is None:
            # block2 has no successor for this branch, so the two conditionals cannot be paired up
            return False

        if not is_similar(t1_blk, t2_blk, partial=True):
            mismatched_blocks[target_type] = {block1: t1_blk, block2: t2_blk}

    if len(mismatched_blocks) != 1:
        return False

    # We now know that at least one block matches
    # at this moment we have something that looks like this:
    #   A ---> C <--- B
    #   |             |
    #   V             V
    #   D             E
    #
    # A and B both share the same condition, point to a block that is either similar to each
    # other or the same block, AND they have a mismatch block D & E. We want to make a new NOP
    # block that is between A->D and B->E to make a balanced merged graph:
    #
    #        A ---> C <--- B
    #        |             |
    #        V             V
    #   N -> D             E <- N'
    #
    # We now will have a balanced merge graph
    for target_type, block_map in mismatched_blocks.items():
        for src, dst in block_map.items():
            # create a new nop block
            nop_blk = Block(
                self.new_block_addr(),
                0,
                statements=[Jump(0, Const(0, 0, 0, self.project.arch.bits), 0, ins_addr=self.new_block_addr())],
            )
            # point src -> nop -> dst
            graph.add_edge(src, nop_blk)
            graph.add_edge(nop_blk, dst)
            # unlink src -X-> dst
            graph.remove_edge(src, dst)
            # correct the targets of the src
            target = getattr(src.statements[-1], target_type)
            target.value = nop_blk.addr

    return True
@staticmethod
def _has_single_successor_path(source, target, graph):
    """
    Look for a simple path (at most 10 edges) from source to target on which every intermediate
    node has exactly one outgoing edge.

    @param source: node the path starts at
    @param target: node the path must end at
    @param graph:  graph to search
    @return:       the first such path as a list of nodes, or an empty list when either node is
                   missing from the graph, no path exists, or no path qualifies
    """
    if not (source in graph and target in graph):
        return []

    if not nx.has_path(graph, source, target):
        return []

    for candidate_path in nx.all_simple_paths(graph, source, target, cutoff=10):
        # the endpoints are exempt from the out-degree requirement
        interior_is_linear = all(
            graph.out_degree(step) == 1
            for step in candidate_path
            if step is not target and step is not source
        )
        if interior_is_linear and candidate_path[-1] is target:
            return candidate_path

    return []


def _block_has_goto_edge(self, block: ailment.Block, other_ends, graph=None):
    """
    Decide whether the given block is involved in a goto edge related to the other ends.

    @param block:      the block to inspect; it must contain at least one statement
    @param other_ends: the other end blocks to relate the gotos to
    @param graph:      the graph that holds the blocks; the neighbor-based checks are skipped
                       when it is falsy
    @return:           True when one of the cases below matches, False otherwise
    """
    # reading the last statement up front keeps the requirement that the block is not empty
    last_stmt = block.statements[-1]

    # case1:
    # A -> (goto) -> B.
    # a goto leaves this block (from any instruction, since instructions can shift) and one of
    # the other ends reaches the goto target through a single-successor path
    for goto in self._goto_manager.gotos_in_block(block):
        goto_target = find_block_in_successors_by_addr(goto.dst_addr, block, graph)
        for end in other_ends:
            if self._has_single_successor_path(end, goto_target, graph):
                return True

    if not graph:
        return False

    # case2:
    # A.last (conditional) -> (goto) -> B -> C
    #
    # Some condition ends in a goto to one of the ends of the merge graph. This is treated as a
    # variation of the case above.
    for pred in graph.predecessors(block):
        last_stmt = pred.statements[-1]
        if not isinstance(last_stmt, ConditionalJump):
            continue

        pred_gotos = self._goto_manager.gotos_in_block(pred)
        # TODO: this is only valid for duplication reverter, but it should be better
        if pred_gotos and block.idx is not None:
            return True

        if any(goto.dst_addr in (block.addr, block.statements[0].ins_addr) for goto in pred_gotos):
            return True

    for succ in graph.successors(block):
        last_stmt = succ.statements[-1]
        if not isinstance(last_stmt, ConditionalJump):
            continue

        succ_gotos = self._goto_manager.gotos_in_block(succ)
        # TODO: this is only valid for duplication reverter, but it should be better
        if succ_gotos and block.idx is not None:
            return True

        # NOTE(review): the scan below never influences the return value; it only stops early
        # once an end without a successor at the goto destination is seen. It is kept as-is,
        # confirm whether a positive result was meant to return True.
        for goto in succ_gotos:
            for other_end in other_ends:
                end_successor_addrs = [other_succ.addr for other_succ in graph.successors(other_end)]
                if goto.dst_addr not in end_successor_addrs:
                    break

    return False


def _find_future_irreducible_gotos(self, max_endpoint_distance=5):
    """
    Checks if these gotos could be fixed by eager returns.

    A goto is reported when its destination block exists in the output graph, is not an end node
    (out-degree 0) itself, and has no simple path of at most `max_endpoint_distance` edges to any
    end node.

    @param max_endpoint_distance: cutoff for the simple-path search towards the end nodes
    @return:                      the set of gotos that are reported
    """
    out_graph = self.out_graph
    exit_nodes = [blk for blk in out_graph.nodes() if out_graph.out_degree[blk] == 0]
    addr_to_block = {blk.addr: blk for blk in out_graph.nodes()}

    irreducible = set()
    for goto in self._goto_manager.gotos:
        dst_block = addr_to_block.get(goto.dst_addr, None)
        # skip gotos whose destination does not exist
        if not dst_block:
            continue

        # a goto that lands directly on an end node is fine
        if dst_block in exit_nodes:
            continue

        for exit_node in exit_nodes:
            if dst_block not in out_graph or exit_node not in out_graph:
                continue
            if not nx.has_path(out_graph, dst_block, exit_node):
                continue

            bounded_paths = nx.all_simple_paths(out_graph, dst_block, exit_node, cutoff=max_endpoint_distance)
            if next(bounded_paths, None) is not None:
                # an end node is close enough, so this goto is not reported
                break
        else:
            # no end node is reachable within the distance limit
            irreducible.add(goto)

    return irreducible
def collect_conditions_between_nodes(self, graph, source: Block, sinks: list[Block], max_depth=15):
    """
    Compute, for every sink, the condition attached to it by the condition processor on the
    loop-free region spanned between the source and the sinks.

    @param graph:     the graph to search in
    @param source:    the node all paths start from
    @param sinks:     the nodes to collect conditions for
    @param max_depth: cutoff for the simple-path search from the source to each sink
    @return:          a dict mapping each sink to its converted condition
    @raise SAILRSemanticError: when the condition processor has neither a guarding nor a reaching
                      condition for a sink; the sinks tuple is added to the candidate blacklist first
    """
    # every node on a bounded simple path from the source to a sink belongs to the region
    region_nodes = set(sinks)
    for sink in set(sinks):
        # the cutoff bounds how many nodes can be pulled in by this search
        for path in nx.all_simple_paths(graph, source=source, target=sink, cutoff=max_depth):
            region_nodes.update(path)

    full_condition_graph: nx.DiGraph = nx.DiGraph(nx.subgraph(graph, region_nodes))

    # make the region acyclic: as long as a cycle is found, drop its first reported edge
    while True:
        try:
            cycle_edges = nx.find_cycle(full_condition_graph)
        except nx.NetworkXNoCycle:
            break

        first_edge = cycle_edges[0]
        full_condition_graph.remove_edge(*first_edge)

    # with the full target graph available, recover the conditions that allow control flow to get
    # to each target end; they are used to construct a guarding node later
    cond_proc = self._ri.cond_proc
    cond_proc.recover_reaching_conditions(None, graph=full_condition_graph)

    conditions_by_start = {}
    for sink in sinks:
        # guarding conditions take precedence over reaching conditions
        if sink in cond_proc.guarding_conditions:
            raw_condition = cond_proc.guarding_conditions[sink]
        elif sink in cond_proc.reaching_conditions:
            raw_condition = cond_proc.reaching_conditions[sink]
        else:
            # TODO: this should be better fixed
            self.candidate_blacklist.add(tuple(sinks))
            raise SAILRSemanticError(
                f"Unable to find the conditions for target: {sink}. "
                f"This is likely caused by unsupported statements, like Switches, being in the graph."
            )

        simplified = cond_proc.simplify_condition(raw_condition)
        if simplified.is_true() or simplified.is_false():
            # a constant condition is replaced by the simplified reaching condition
            simplified = cond_proc.simplify_condition(cond_proc.reaching_conditions[sink])

        conditions_by_start[sink] = cond_proc.convert_claripy_bool_ast(simplified)

    return conditions_by_start
#
# Search Stages
#


def _share_subregion(self, blocks: list[Block]) -> bool:
    """
    Check whether a single known region contains the addresses of all the given blocks.

    @param blocks: the blocks to test
    @return:       True when at least one region in `self._ri.regions_by_block_addrs` contains
                   every block address
    """
    return any(all(block.addr in region for block in blocks) for region in self._ri.regions_by_block_addrs)


def _is_valid_candidate(self, b0, b1):
    """
    Decide whether two blocks are worth considering as a duplication pair.

    The blocks must both have statements, share a region, have a statement in common and share a
    common conditional dominator in the write graph.

    @param b0: first block
    @param b1: second block
    @return:   True when the pair is a valid candidate
    """
    # blocks must have statements
    if not b0.statements or not b1.statements:
        return False

    # blocks must share a region
    if not self._share_subregion([b0, b1]):
        return False

    if len(b0.statements) == len(b1.statements) == 1:
        # special case: when we only have a single stmt
        # Case 1:
        # [if(a)] == [if(b)]
        #
        # we must use the more expensive `similar` function to tell on the graph if they are
        # stmts that result in the same successors
        #
        # Case 2:
        # [if(a)] == [if(a)]
        # and at least one child for the correct target type matches
        # TODO: this is not yet supported
        stmt_in_common = bool(is_similar(b0, b1, graph=self.read_graph))
    else:
        # check if these nodes share any stmt in common
        stmt_in_common = False
        for stmt0 in b0.statements:
            # jumps don't count
            if isinstance(stmt0, Jump):
                continue

            # Most Assignments don't count just by themselves:
            # register = register
            # TOP = const | register
            if isinstance(stmt0, Assignment):
                # look through a conversion on the source; the check has to be made on the
                # source itself since that is the expression being unwrapped
                src = stmt0.src
                if isinstance(src, Convert):
                    src = src.operand
                if isinstance(src, Register) or (isinstance(src, Const) and src.bits > 2):
                    continue

            if any(is_similar(stmt0, stmt1, graph=self.write_graph) for stmt1 in b1.statements):
                stmt_in_common = True
                break

    if not stmt_in_common:
        return False

    # must share a common dominator
    return self.shared_common_conditional_dom((b0, b1), self.write_graph) is not None


@staticmethod
def _construct_goto_related_subgraph(base: Block, graph: nx.DiGraph, max_ancestors=5):
    """
    Creates a subgraph of the large graph starting from the base block and working upwards
    (predecessors) for max_ancestors levels.

    @param base:          the block to start from
    @param graph:         the graph to walk
    @param max_ancestors: how many predecessor levels to walk
    @return:              a tuple of (subgraph, levels) where levels maps every collected block to
                          the level assigned to it (the last one assigned when a block is reached
                          more than once)
    """
    blocks = [base]
    level_blocks = [base]
    block_lvls = {base: 0}
    new_level_blocks = []
    for lvl in range(max_ancestors):
        for lblock in level_blocks:
            block_lvls[lblock] = lvl + 1

        # de-duplicate (order preserving) so shared ancestors are only expanded once per level
        new_level_blocks = list(
            dict.fromkeys(pred for lblock in level_blocks for pred in graph.predecessors(lblock))
        )
        blocks += new_level_blocks
        level_blocks = new_level_blocks

    # collect last level blocks: they were gathered but never visited as a level themselves
    for new_block in new_level_blocks:
        if new_block not in block_lvls:
            block_lvls[new_block] = max_ancestors + 1

    # construct the final subgraph
    g = nx.subgraph(graph, blocks)
    return g, block_lvls


def _find_initial_candidates(self) -> list[tuple[Block, Block]]:
    """
    Collect the initial candidate pairs by searching around every goto edge of the read graph.

    For each goto edge, all block pairs in the ancestor subgraph of the goto destination are
    tested with `_is_valid_candidate`; blacklisted pairs are dropped and a single pair, the one
    with the largest summed level, is kept.

    @return: de-duplicated candidate pairs, sorted by the sum of their block addresses
    """
    # first, find all the goto edges, since these locations will always be the base of the merge
    # graph we create; therefore, we only need search around gotos
    goto_edges = self._goto_manager.find_goto_edges(self.read_graph)
    goto_edges = sorted(goto_edges, key=lambda x: x[0].addr + x[1].addr)

    candidates = []
    for goto_src, goto_dst in goto_edges:
        candidate_subgraph, dist_by_block = self._construct_goto_related_subgraph(goto_dst, self.read_graph)
        goto_candidates = [
            tuple(sorted([b0, b1], key=lambda x: x.addr))
            for b0, b1 in combinations(candidate_subgraph, 2)
            if self._is_valid_candidate(b0, b1)
        ]

        # eliminate any that are already blacklisted
        goto_candidates = [c for c in goto_candidates if c not in self.candidate_blacklist]
        if not goto_candidates:
            continue

        # re-sort candidates by address (for tiebreakers)
        goto_candidates = sorted(goto_candidates, key=lambda x: x[0].addr + x[1].addr, reverse=True)

        # choose only a single candidate for this goto, make it the one nearest to the head,
        # i.e. the pair with the largest summed level; ties keep the earliest pair in the order
        # above. (The running best distance used to be left at the first pair's value, so a
        # later, closer pair could replace a farther one.)
        best = max(goto_candidates, key=lambda pair: dist_by_block[pair[0]] + dist_by_block[pair[1]])
        if best == (goto_src, goto_dst)[::-1]:
            # just flip it to normalize
            best = best[::-1]
        candidates.append(best)

    candidates = list(set(candidates))
    candidates.sort(key=lambda x: x[0].addr + x[1].addr)
    return candidates


def _filter_candidates(self, candidates, merge_candidates=True):
    """
    Perform a series of filters on the candidates to reduce the fast set to an assured set of the
    duplication case we are searching for.

    @param candidates:       tuples of blocks to filter
    @param merge_candidates: when True, candidates that share a block and have statements in
                             common are merged into larger candidates
    @return:                 de-duplicated candidates, each sorted by block address, ordered by
                             the sum of their block addresses
    """
    #
    # filter out bad candidates from the blacklist
    #

    # Blacklist entries are compared as order-insensitive sets of (addr, idx) ids. Entries are not
    # necessarily pairs (a whole tuple of sinks can be blacklisted), so they must not be unpacked
    # as two elements.
    blacklisted_ids = {frozenset((blk.addr, blk.idx) for blk in entry) for entry in self.candidate_blacklist}
    candidates = [
        candidate
        for candidate in candidates
        if frozenset((blk.addr, blk.idx) for blk in candidate) not in blacklisted_ids
    ]

    # when enabled, attempts to merge candidates
    if merge_candidates:
        #
        # Now, merge pairs that may actually be n-pairs. This will look like multiple pairs having a single
        # block in common, and have one or more statements in common.
        #
        not_fixed = True
        while not_fixed:
            not_fixed = False
            queued = set()
            merged_candidates = []

            # no merging needs to be done, there is only one candidate left
            if len(candidates) == 1:
                break

            for can0 in candidates:
                # skip candidates being merged
                if can0 in queued:
                    continue

                for can1 in candidates:
                    if can0 == can1 or can1 in queued:
                        continue

                    # only try a merge if candidates share a node in common
                    if not set(can0).intersection(can1):
                        continue

                    merged_blocks = set(can0 + can1)
                    lcs, _ = longest_ail_subseq([b.statements for b in merged_blocks])
                    if not lcs:
                        continue

                    merged_candidates.append(tuple(merged_blocks))
                    queued.add(can0)
                    queued.add(can1)
                    not_fixed = True
                    break

            # keep only the candidates that share no block with any merged candidate
            remaining_candidates = [
                can
                for can in candidates
                if not any(blk in m_can for m_can in merged_candidates for blk in can)
            ]
            candidates = merged_candidates + remaining_candidates

    candidates = list(set(candidates))
    candidates = [tuple(sorted(candidate, key=lambda x: x.addr)) for candidate in candidates]
    candidates = sorted(candidates, key=lambda x: sum(c.addr for c in x))

    return candidates
def shared_common_conditional_dom(self, nodes, graph: nx.DiGraph):
    """
    Takes n nodes and searches for a block that ends in a ConditionalJump, is not one of the
    nodes, and dominates all of them. The search walks upwards (predecessors) level by level,
    starting from the first node.

    NOTE: the entry node and the immediate dominators are cached per graph object and are not
    invalidated here, so they reflect the graph as it was when it was first queried.

    @param nodes: a sequence of blocks in the graph
    @param graph: the graph to search; it must have exactly one node without predecessors
    @return:      the dominating block, or None when the graph has no unique entry node, when
                  one of the nodes dominates another one, or when no such block is found
    """
    if not nodes:
        return None

    if graph not in self._entry_node_cache:
        entry_blocks = [blk for blk in graph.nodes if graph.in_degree(blk) == 0]
        if len(entry_blocks) != 1:
            # without a unique entry the dominators cannot be computed
            return None
        self._entry_node_cache[graph] = entry_blocks[0]

    entry_blk = self._entry_node_cache[graph]

    if graph not in self._idom_cache:
        self._idom_cache[graph] = nx.algorithms.immediate_dominators(graph, entry_blk)

    idoms = self._idom_cache[graph]

    # first check if any of the node pairs could be a dominating loop; every pair is checked so
    # that more than two nodes are supported as well
    if any(dominates(idoms, a, b) or dominates(idoms, b, a) for a, b in combinations(nodes, 2)):
        return None

    node_level = [nodes[0]]
    seen_nodes = set()
    while node_level:
        # check if any of the nodes on the current level are dominators to all nodes
        for cnode in node_level:
            if not cnode.statements:
                continue

            if (
                isinstance(cnode.statements[-1], ConditionalJump)
                and cnode not in nodes
                and all(dominates(idoms, cnode, other) for other in nodes)
            ):
                return cnode

        # if no dominators found, move up a level
        seen_nodes.update(node_level)
        next_level = itertools.chain.from_iterable(graph.predecessors(cnode) for cnode in node_level)
        # only add nodes we have never seen
        node_level = set(next_level).difference(seen_nodes)

    return None