/* (c) Copyright 2004-2005, Cadence Design Systems, Inc.  All rights reserved. 

This file is part of the OA Gear distribution.  See the COPYING file in
the top level OA Gear directory for copyright and licensing information. */

/*
Author: Aaron P. Hurst <ahurst@eecs.berkeley.edu>
 
ChangeLog:
2006-12-26: ChangeLog started
*/

#include "oagMapperUtils.h"
#include "oagMapperTable.h"
#include "oagFunc.h"
#include "oagFuncModGraph.h"
#include "oagFuncSimMod.h"
#include "oagFuncSimOcc.h"
#include "oagFuncManager.h"
#include <float.h>

// #define DEBUG

#include "oagMapperDebug.h"

using namespace oagFunc;

namespace oagMapper {

// *****************************************************************************
// Table()
//
/// \brief Constructor.
///
/// \param cutsPerNode the maximum number of cuts generated per node
///
// *****************************************************************************
Table::Table(int cutsPerNode) {

    // there must be enough bits in a table entry to store the truth table of
    // the widest cut.  this is checked before anything is built, because
    // addTrivialGates() already sets bits up to index (1<<MAX_CUT_SIZE)-1
    assert((1<<MAX_CUT_SIZE) <= TableEntry::MAX_WORDS*TableEntry::BITS_PER_WORD);

    // set the maximum number of cuts generated per node
    this->cutsPerNode = cutsPerNode;

    gateCount = seqCount = 0;
    useAlternateViews = false;

    // seed the match tables with the constant and wire functions
    addTrivialGates();

    // prepare simulation vectors
    initializeSimulation();
}


// *****************************************************************************
// useAlternateView()
//
/// \brief Maps the design to a library view other than the one containing the functional description.
///
/// This must be set before any gates are added to the library.
///
/// \param viewName the alternate view name
///
// *****************************************************************************
void
Table::useAlternateView(const oa::oaScalarName &viewName) {
  // switch on alternate-view lookup; addLibraryGate() checks this flag and,
  // when set, resolves each gate's implementation design by the stored name
  this->useAlternateViews = true;
  this->alternateViewName = viewName;
}


// *****************************************************************************
// addTrivialGates()
//
/// \brief Adds trivial functions to match table.
///
/// Two types of functions are added: direct connections to an input (i.e. 
/// wires) and constant functions.
//
// *****************************************************************************
void
Table::addTrivialGates() {

  // for each cut width
  for(int i=0; i<=MAX_CUT_SIZE; i++) {

    // number of truth table rows for a cut of this width
    const int numRows = (1<<i);

    // add constant zero entry
    TableEntry constantEntry;
    constantEntry.cell = NULL;
    bzero(constantEntry.func, sizeof(int)*constantEntry.MAX_WORDS);
    constantEntry.N_input = 0;
    constantEntry.P = 0;
    constantEntry.directFlag = 0;
    constantEntry.constantFlag = 1;
    tables[i].push_back(constantEntry);

    // add constant one entry (same entry with every valid row set)
    for(int bit=0; bit<numRows; bit++) {
      constantEntry.setBit(bit, true);
    }
    tables[i].push_back(constantEntry);

    // add all direct connections
    for(int j=0; j<i; j++) {
      TableEntry wireEntry;
      wireEntry.cell = NULL;
      bzero(wireEntry.func, sizeof(int)*wireEntry.MAX_WORDS);
      wireEntry.directInput = j;
      // the cumulative cost routines read N_input of every matched entry,
      // wires included, so it must not be left unset.  a wire passes its
      // input through non-inverted.
      wireEntry.N_input = 0;
      wireEntry.P = 0;
      wireEntry.directFlag = 1;
      wireEntry.constantFlag = 0;
      // row v of the truth table is simply the value of input j in row v
      for(int v=0; v<numRows; v++) {
        wireEntry.setBit(v, (v >> j) & 0x1);
      }
      tables[i].push_back(wireEntry);
    }
  }
}


// *****************************************************************************
// addLibraryGate()
//
/// \brief Characterizes a library gate and adds it to the match table.
///
/// If the gate does not have exactly one output bit, it is ignored.
///
/// The gate must be purely combinational.
///
/// All 2^|input| input complementations are also added to the table.
///
/// Input permutations are not yet supported.
///
/// \param design the library gate
// *****************************************************************************
void
Table::addLibraryGate(oa::oaDesign* design) {
  assert(design);

  oa::oaModule *module = design->getTopModule();
  assert(module);
  Manager *manager = oagFunc::Manager::get(design);
  if (!manager) {
    std::cout << "WARNING: Ignoring library cell.  "
              << "Either it is structural or is missing a functional description." << std::endl;
    return;
  }

#if defined(DEBUG)
  oa::oaString modString;
  module->getName(oa::oaVerilogNS(), modString);
  DEBUG_PRINTLN(modString);
#endif

  // find all input and output terminals
  std::vector<oa::oaModTerm*> inputTerms;
  std::vector<oa::oaModTerm*> outputTerms;
  oa::oaModTerm *term;
  oa::oaIter<oa::oaModTerm> termIter(module->getTerms(oacTermIterSingleBit));
  while((term = termIter.getNext())) {
    if (term->getTermType() == oa::oacInputTermType) {
      inputTerms.push_back(term);
    } else if (term->getTermType() == oa::oacOutputTermType) {
      outputTerms.push_back(term);
    }
  }

  // if this gate has zero or multiple outputs, ignore
  if (outputTerms.size() != 1) {
    std::cout << "WARNING: Ignoring library cell.  "
              << "Multiple outputs are currently unsupported." << std::endl;
    return;
  }
  // if this gate has zero inputs, ignore
  if (inputTerms.size() == 0) return;

  // find the AI nodes of the inputs and outputs
  oa::oaModBitNet *outputNet = oagFunc::toBitNet(outputTerms[0]->getNet());
  assert(outputNet);
  ModRef outputNode = ModGraph::getNetToAiConnection(outputNet);
  assert(!oagFunc::ModGraph::isNull(outputNode));

  int len = inputTerms.size();
  vector<oagFunc::ModRef> inputNodes(len);
  for(int i=0; i<len; i++) {
    oa::oaModBitNet *net = oagFunc::toBitNet(inputTerms[i]->getNet());
    inputNodes[i] = oagFunc::ModGraph::getNetToAiConnection(net);
    assert(!oagFunc::ModGraph::isNull(inputNodes[i]));
  }
  
  // check that there is enough room for the full truth table
  if (len > MAX_CUT_SIZE) {
    std::cout << "WARNING: Ignoring library cell.  "
              << "Too many inputs for the size of the truth tables." << std::endl;
    return;
  }
  assert(TableEntry::MAX_WORDS >= (1 << len)/TableEntry::BITS_PER_WORD);

  // find an alternate view, if specified
  oa::oaDesign *implementDesign = NULL;
  if (useAlternateViews) {
    // an alternate view was specified...
    oa::oaScalarName libName, cellName;
    design->getLibName(libName);
    design->getCellName(cellName);
    implementDesign = oa::oaDesign::find(libName, cellName, alternateViewName);
    // try opening from disk
    if (!implementDesign) {
      try {
        implementDesign = oa::oaDesign::open(libName, cellName, alternateViewName, 'r');
        assert(implementDesign);
      } catch (oa::oaException &e) {
        // deliberately ignored: a failed open leaves implementDesign NULL,
        // which is reported just below
      }
    }
    if (!implementDesign) {
      // couldn't open alternate view!
      oa::oaString viewString, cellString;
      alternateViewName.get(viewString);
      cellName.get(cellString);
      std::cerr << "ERROR: Could not find alternate view " << viewString 
                << " for cell " << cellString << endl;
      QUIT_ON_ERROR;
    }
  } else {
    implementDesign = design;
  }

  libraryCells.push_back(design);

  // one table entry per input complementation pattern: bit i of N_input
  // set means input i is simulated inverted.
  // the counter is deliberately wider than a byte; an 8-bit counter can
  // never reach (1<<len) once len >= 8 and the loop would not terminate.
  const unsigned int numPatterns = (1u << len);
  for(unsigned int N_input=0; N_input<numPatterns; N_input++) {
    TableEntry t;
    t.cell = design;
    t.implementCell = implementDesign;
    bzero(t.func, sizeof(int)*t.MAX_WORDS);
    t.N_input = N_input;
    t.P = 0;         // input permutation unsupported
    t.directFlag = t.constantFlag = 0;
    simulate(design, t, outputNode, inputNodes);
    DEBUG_PRINTMORE(" ");
    tables[len].push_back(t);
    if (len <= 1) break; // don't bother with inverting 1-input gates
  }

  DEBUG_PRINTMORE("\n");
}


// *****************************************************************************
// initializeSimulation()
//
/// \brief Pre-computes the input vectors necessary for an exhaustive simulation.
//
// *****************************************************************************
void
Table::initializeSimulation() {
  bzero(exhaustiveInputVectors, sizeof(unsigned int)*(MAX_CUT_SIZE)*TableEntry::MAX_WORDS);
  // v enumerates every assignment of the MAX_CUT_SIZE inputs; bit i of v is
  // the value of input i in that assignment.  assignment v is stored at bit
  // (v % BITS_PER_WORD) of word (v / BITS_PER_WORD) of each input's vector.
  for(int v=0; v<(1<<MAX_CUT_SIZE); v++) {
    assert(v/TableEntry::BITS_PER_WORD < TableEntry::MAX_WORDS);
    for(int i=0; i<MAX_CUT_SIZE; i++) {
      // shift an unsigned value: the target position can be the top bit of
      // the word, which a signed int must not be shifted into
      exhaustiveInputVectors[i][v/TableEntry::BITS_PER_WORD] |=
        static_cast<unsigned int>((v >> i) & 0x1) << (v%TableEntry::BITS_PER_WORD);
    }
  }
}


// *****************************************************************************
// simulate()
//
/// \brief Simulates the entire function of the logic in a cone between an input cut and a node.
///
/// \param design the design whose function manager provides the graph that is simulated
/// \param outResult a table entry for the target logic cone
/// \param out the output vertex of the simulation cone (i.e. the node being mapped)
/// \param cut the input cut of the cone being simulated
//
// *****************************************************************************
void
Table::simulate(oa::oaDesign *design, 
                TableEntry & outResult, oagFunc::ModRef & out, 
                const vector<oagFunc::ModRef> cut) {
  assert(design);

  const int numLeaves = cut.size();

  // Performance note: this routine is on the mapper's critical path, so the
  // cone between the cut and the mapping point is collected straight from the
  // underlying oagAi graph rather than through ModGraph, which saves the
  // Ref <-> ModRef conversions.  The scratch vectors are function-local
  // statics that are cleared and refilled on every call.

  // collect the cone, using the cut leaves as its boundary
  static vector<oagAi::Ref> cone;
  cone.clear();
  static vector<oagAi::Ref> coneRoots;
  coneRoots.clear();
  coneRoots.reserve(numLeaves);
  for(int leaf=0; leaf<numLeaves; leaf++) {
    coneRoots.push_back(cut[leaf].ref);
  }
  Manager::get(design)->getGraph()->getFaninCone(out.ref, coneRoots, cone, false);
  // the output vertex itself is simulated last
  cone.push_back(out.ref);
  
  // simulation engine, constructed once on the first call
  // NOTE(review): being static, it stays bound to the design of the first
  // call even when later calls pass a different design -- confirm that
  // SimMod tolerates this
  static SimMod sim(design);

  // evaluate the truth table one machine word at a time
  int bits = (1 << numLeaves);
  unsigned int word = 0;
  for(; bits > 0; bits -= TableEntry::BITS_PER_WORD, word++) {
    assert(word < TableEntry::MAX_WORDS);

    // load the exhaustive pattern of each leaf, complemented when the
    // corresponding bit of N_input is set
    for(int leaf=0; leaf<numLeaves; leaf++) {
      const bool complemented = (outResult.N_input >> leaf) & 0x1;
      if (complemented) {
        sim.setVector(cut[leaf], ~exhaustiveInputVectors[leaf][word]);
      } else {
        sim.setVector(cut[leaf], exhaustiveInputVectors[leaf][word]);
      }
    }

    // propagate through the cone in the order it was collected
    for(vector<oagAi::Ref>::size_type n = 0; n < cone.size(); n++) {
      sim.runOne(ModRef(cone[n], out.module));
    }
    sim.getVector(out, outResult.func[word]);
  }

  // a negative remainder means the last word was only partially used;
  // clear the bits beyond the end of the truth table
  if (bits < 0) {
    unsigned int mask = (0x1 << (bits + TableEntry::BITS_PER_WORD))-1;
    outResult.func[word-1] &= mask;
  }

#if defined(DEBUG)
  // print out the binary result
  for(unsigned int b=0; b<TableEntry::BITS_PER_WORD*TableEntry::MAX_WORDS; b++)
    cout << (outResult.getBit(b) ? "1" : "0");
#endif
}


// *****************************************************************************
// setAreaCosts()
//
/// \brief Sets the cost of each match to be the area of the gate.
///
/// The area is the bounding box of the block, if defined, otherwise
/// the number of inputs.
///
// *****************************************************************************
void
Table::setAreaCosts() {
  
  // iterate through all table entries
  for(int i=0; i<=MAX_CUT_SIZE; i++) {
    for(unsigned int j=0; j<tables[i].size(); j++) {
      TableEntry *choice = &(tables[i][j]);
        
      // is a trivial mapping?  wires and constants are free
      if (choice->directFlag || choice->constantFlag) {
        choice->cost = 0.0;
        continue;
      }
      assert(choice->implementCell);

      // has a top block?
      oa::oaBlock *topBlock;
      if ((topBlock = choice->implementCell->getTopBlock())) {
        // has a bounding box?
        oa::oaBox box;
        topBlock->getBBox(box);
        // convert each dimension before multiplying so that the product is
        // formed in floating point and cannot wrap in the integer type
        choice->cost = static_cast<double>(box.getWidth())
                     * static_cast<double>(box.getHeight());
        if (choice->cost > 0) continue;

        // degenerate bounding box: fall back to counting input terms
        oa::oaTerm *term;
        oa::oaIter<oa::oaTerm> termIter = topBlock->getTerms(oacTermIterSingleBit);
        int numInputs = 0;
        while((term = termIter.getNext())) {
          if (term->getTermType() == oa::oacInputTermType) {
            numInputs++;
          }
        }
        choice->cost = numInputs;
        continue;
      }

      // has a top module?
      oa::oaModule *topModule;
      if ((topModule = choice->implementCell->getTopModule())) {
        // no physical view at all: the cost is the number of input terms
        oa::oaModTerm *term;
        oa::oaIter<oa::oaModTerm> termIter = topModule->getTerms(oacTermIterSingleBit);
        int numInputs = 0;
        while((term = termIter.getNext())) {
          if (term->getTermType() == oa::oacInputTermType) {
            numInputs++;
          }
        }
        choice->cost = numInputs;
        continue;
      }
      
      cerr << "ERROR: Library cell has neither top block nor top module" << endl;
      QUIT_ON_ERROR;
    }
  }
}



// *****************************************************************************
// setDelayCosts()
//
/// \brief Sets the cost of each match to be the delay of the gate.
///
/// All gates are assumed to have a delay of 1.0.  The resulting mapping will
/// be optimal in the number of levels.
///
// *****************************************************************************
void
Table::setDelayCosts() {
  
  // iterate through all table entries
  for(int i=0; i<=MAX_CUT_SIZE; i++) {
    for(unsigned int j=0; j<tables[i].size(); j++) {
      TableEntry *choice = &(tables[i][j]);
        
      // is a trivial mapping?
      if (choice->directFlag || choice->constantFlag) {
        choice->cost = 0.0;
        continue;
      }

      choice->cost = 1.0;
    }
  }
}


// *****************************************************************************
// getCumulativeAreaCost()
//
/// \brief Returns the area cost of a match and the necessary input cone.
///
/// This includes the cost of all of the inputs.  If the inputs have
/// other fanouts, this may result in double counting.
///
/// \param cut
/// \param choice
/// \return cost
// *****************************************************************************
double
Table::getCumulativeAreaCost(oagFunc::ModGraph::Cut *cut, TableEntry *choice) {
  assert(cut);
  assert(choice);

  // begin with the cost of the gate itself...
  double total = choice->cost;

  // ...then add the cost of every cut leaf, in the polarity the match needs:
  // bit k of N_input set means the k-th leaf is consumed inverted
  int leafIndex = 0;
  ModGraph::Cut::iterator leaf = cut->begin();
  while(leaf != cut->end()) {
    const bool inverted = (choice->N_input >> leafIndex) & 0x1;
    if (!inverted) {
      assert(cost_p.find(*leaf) != cost_p.end());
      total += cost_p[*leaf];
    } else {
      assert(cost_n.find(*leaf) != cost_n.end());
      total += cost_n[*leaf];
    }
    leaf++;
    leafIndex++;
  }
  return total;
}


// *****************************************************************************
// getCumulativeDelayCost()
//
/// \brief Returns the total delay cost of a match and the necessary inputs cone.
///
/// \param cut
/// \param choice
/// \return cost
// *****************************************************************************
double
Table::getCumulativeDelayCost(oagFunc::ModGraph::Cut *cut, TableEntry *choice) {
  assert(cut);
  assert(choice);

  // 1. cost of inputs: the largest cost over all cut leaves, each taken in
  //    the polarity the match needs (bit i of N_input set = leaf i inverted)
  double cost = 0.0;
  int i=0;
  for(ModGraph::Cut::iterator it = cut->begin(); it!= cut->end(); it++) {
    double inputCost;
    if ((choice->N_input >> i) & 0x1) {
      // inverted
      assert(cost_n.find(*it) != cost_n.end());
      inputCost = cost_n[*it];
    } else {
      // non-inverted
      assert(cost_p.find(*it) != cost_p.end());
      inputCost = cost_p[*it];
    }
    // explicit running maximum; replaces the GCC-only ">?=" operator,
    // which current compilers no longer accept
    if (inputCost > cost) {
      cost = inputCost;
    }
    i++;
  }
  // 2. plus the cost of the choice itself
  return cost + choice->cost;
}


// *****************************************************************************
// techmapArea()
//
/// \brief Maps a design to the specified gates to minimize area.
///
/// All of the functional behavior of the provided design is implemented using
/// structural library components.
///
/// The result will only be area-optimal for tree structures.  If nodes have
/// multiple fan-outs, the area required to implement that node will be counted
/// for each.  One easy potential improvement is that when a particular
/// mapping is chosen, to update the area cost of using that choice to 0 and
/// recompute the mapping of the other fan-outs.
/// 
/// \param target
// *****************************************************************************
void
Table::techmapArea(oa::oaModule *target) {
    assert(target);

    cout << "Mapping for area..." << endl;
    
    // has a set of library cells been supplied?
    if (libraryCells.empty()) {
      cerr << "ERROR: Empty combinational library.  Aborting." << endl;
      return;
    }

    // a design without a functional description has no manager at all, so
    // the pointer must be checked before it is queried
    oagFunc::Manager *currentManager = oagFunc::Manager::get(target->getDesign());
    if (!currentManager || currentManager->isStructural()) {
        // the design is already entirely structural
        cerr << "ERROR: Target is already structural or is missing functional "
             << "description.  Aborting." << endl;
        return;
    }

    // initialize
    ModGraph::clearKfeasibleCuts(target);
    cut_p.clear(); cut_n.clear();
    choice_p.clear(); choice_n.clear();
    cost_p.clear(); cost_n.clear();
    totalArea = totalDelay = 0;

    // check for combinational cycles
    if (ModGraph::hasCombinationalCycle(target)) {
      cerr << "ERROR: Target graph has a combinational cycle" << endl;
      QUIT_ON_ERROR;
    }

    // set cost of all mapping choices to area
    setAreaCosts();

    // identify not (and corresponding table entry)
    if (!mapUtils.notGate) {
      mapUtils.identifyNot(libraryCells);
    }
    if (!mapUtils.notGate) {
      cerr << "ERROR : Could not identify inverter in cell library" << endl;
      QUIT_ON_ERROR;
    }
    mapUtils.identifyNotTerminals();
    notEntry = NULL;
    for(unsigned int i = 0; i < tables[1].size(); i++) {
      if (tables[1][i].cell == mapUtils.notGate) {
        notEntry = &(tables[1][i]);
        break;
      }
    }
    assert(notEntry);

    // identify seq gate
    if (!mapUtils.seqGate) {
      cerr << "ERROR : Could not sequential gate in cell library" << endl;
      QUIT_ON_ERROR;
    }    
    mapUtils.identifySeqTerminalsByName();

    // strip reset nets
    mapUtils.identifyControls(target);
    mapUtils.removeAsyncResetsFromLogic(target);

    // handle constant nodes and nets: nets named "tie0" / "tie1", if present,
    // are driven by the graph's constant nodes
    ModRef constantZero = ModGraph::constantZero(target);
    oa::oaModNet *constantNet = oa::oaModNet::find(target, 
                                                   oa::oaScalarName(oa::oaNativeNS(), "tie0"));    
    if (constantNet) {
      ModGraph::setTerminalDriver(ModGraph::getNetToAiConnection(toBitNet(constantNet)),
                                  constantZero);
    }
    constantNet = oa::oaModNet::find(target, 
                                     oa::oaScalarName(oa::oaNativeNS(), "tie1"));
    if (constantNet) {
      ModGraph::setTerminalDriver(ModGraph::getNetToAiConnection(toBitNet(constantNet)),
                                  ModGraph::constantOne(target));
    }

    // collect the points that must be mapped
    //    1. primary outputs
    //    2. next state inputs of sequential nodes
    list<ModRef> outputsToMap, seqNodes;
    ModGraph::getOutputs(target, outputsToMap);
    ModGraph::getLocalStates(target, seqNodes);
    for(list<ModRef>::iterator it = seqNodes.begin();
        it != seqNodes.end(); it++) {
      if (!ModGraph::isNull(ModGraph::getNextState(*it))) {
        outputsToMap.push_back(ModGraph::getNextState(*it));
      }
    }
    // collect all fan-ins of these points
    vector<ModRef> nodesToMap;
    ModGraph::getTransitiveFanin(outputsToMap, nodesToMap, true);
    for(list<ModRef>::iterator it = outputsToMap.begin();
        it != outputsToMap.end(); it++) {
      nodesToMap.push_back(*it);
    }

    // --- PHASE 1 : collect best choice for each node

    cout << "\tnum. nodes to map = " << nodesToMap.size() << endl;
    cout << "\t(i) collecting best choice for each node";
    cout.flush();

    int m = 0;
    static_cast<void>(m);

    // map each point
    for(vector<ModRef>::iterator it = nodesToMap.begin();
        it != nodesToMap.end(); it++) {
      ModRef mapPoint = ModGraph::getNonInverted(*it);
      ModGraph::Cut selfCut;
      selfCut.insert(selfCut.begin(), mapPoint);

      // progress indicator (one dot = 1000 points mapped)
#if !defined(DEBUG)
      m++;
      if (m%1000 == 0) { cout << "."; cout.flush(); }
#else
      DEBUG_PRINT("Mapping node ");
      mapPoint.print();
#endif

      if (mapPoint == constantZero) {
        cost_p[mapPoint] = 0;
        cost_n[mapPoint] = 0;
      } else if ((ModGraph::isTerminal(mapPoint) && ModGraph::isNull(ModGraph::getTerminalDriver(mapPoint))) ||
               ModGraph::isSequential(mapPoint)) {
        // undriven terminals and state outputs are free in positive polarity
        // and cost one inverter in negative polarity
        cost_p[mapPoint] = 0;
        cut_n[mapPoint] = selfCut;
        choice_n[mapPoint] = notEntry;
        cost_n[mapPoint] = notEntry->cost;
      } else {
        // explore all possible k-feasible cuts
        // ModGraph::clearKfeasibleCuts(target);
        ModGraph::CutSet cuts = ModGraph::enumerateKfeasibleCuts(mapPoint, MAX_CUT_SIZE, cutsPerNode, -1, true);

        // choose best "p" and "n" cuts and choices
        double minCost_p = DBL_MAX,
          minCost_n = DBL_MAX;
        ModGraph::Cut *bestCut_p = NULL,
          *bestCut_n = NULL;
        TableEntry *bestChoice_p = NULL,
          *bestChoice_n = NULL;
        
        for(list<ModGraph::Cut>::iterator cut_it = cuts.begin(); cut_it != cuts.end(); cut_it++) {
          ModGraph::Cut *currentCut = &(*cut_it);
          assert(currentCut);
          int cutWidth = currentCut->size();
          assert(cutWidth <= MAX_CUT_SIZE);

          // ignore self-cut
          if (cutWidth == 1 && *(currentCut->begin()) == mapPoint) continue;

          // construct cut array
          DEBUG_PRINT("\tcut {");
          int p = 0;
          vector<ModRef> cutArray(cutWidth);
          for(set<ModRef>::iterator leaf_it = currentCut->begin(); leaf_it != currentCut->end(); leaf_it++) {
            DEBUG_PRINTMORE(leaf_it->ref << ", ");
            assert(p < cutWidth);
            cutArray[p++] = *leaf_it;
          }
          DEBUG_PRINTMORE("} ");

          // compute cut function.  simulate() reads N_input to decide which
          // inputs to complement and writes only the words the cut needs, so
          // the entry is cleared first, exactly as addLibraryGate() does
          // (harmless if TableEntry's constructor already does this)
          TableEntry cutFunction;
          bzero(cutFunction.func, sizeof(int)*cutFunction.MAX_WORDS);
          cutFunction.N_input = 0;
          simulate(target->getDesign(), cutFunction, mapPoint, cutArray);
          // compute complement of cut function
          TableEntry complementFunction;
          memcpy(&complementFunction, &cutFunction, sizeof(TableEntry));
          for(int bit=0; bit<(1<<cutWidth); bit++) {
            complementFunction.setBit(bit, !complementFunction.getBit(bit));
          }

          // match cut function
          for(vector<TableEntry>::iterator lib_it = tables[cutWidth].begin(); 
              lib_it != tables[cutWidth].end(); lib_it++) {
            TableEntry *currentChoice = &(*lib_it);
            assert(currentChoice);
            
            // with non-inverted function...
            if (currentChoice->match(cutFunction)) {
              // print name
              if (currentChoice->cell) {
                oa::oaScalarName gateName;
                currentChoice->cell->getCellName(gateName);
                oa::oaString gateString;
                gateName.get(gateString);
                DEBUG_PRINTMORE(" " << gateString);
              } else if (currentChoice->directFlag) {
                DEBUG_PRINTMORE(" WIRE");
              } else if (currentChoice->constantFlag) {
                DEBUG_PRINTMORE(" CONST");
              }
              // keep best match
              const double matchCost = getCumulativeAreaCost(currentCut, currentChoice);
              if (matchCost < minCost_p) {
                minCost_p = matchCost;
                bestChoice_p = currentChoice;
                bestCut_p = currentCut;
              }
            }

            // with inverted function...
            if (currentChoice->match(complementFunction)) {
              // print name
              if (currentChoice->cell) {
                oa::oaScalarName gateName;
                currentChoice->cell->getCellName(gateName);
                oa::oaString gateString;
                gateName.get(gateString);
                DEBUG_PRINTMORE(" !" << gateString);
              } else if (currentChoice->directFlag) {
                DEBUG_PRINTMORE(" !WIRE");
              } else if (currentChoice->constantFlag) {
                DEBUG_PRINTMORE(" !CONST");
              }
              // keep best match
              const double matchCost = getCumulativeAreaCost(currentCut, currentChoice);
              if (matchCost < minCost_n) {
                minCost_n = matchCost;
                bestChoice_n = currentChoice;
                bestCut_n = currentCut;
              }
            }
          }

          DEBUG_PRINTMORE(endl);
        }

        // could we not find a mapping?
        if (!bestChoice_p && !bestChoice_n) {
          cerr << endl << "ERROR: Could not find a mapping for ";
          mapPoint.print();
          cerr << "       This shouldn't happen.  Perhaps the cut count/depth is too small?" << endl;
          QUIT_ON_ERROR;
        }

        // lastly, check self-cuts (i.e. just using an inverter): either
        // polarity may be cheaper as the other polarity plus an inverter
        if (minCost_p + notEntry->cost < minCost_n) {
          minCost_n = minCost_p + notEntry->cost;
          bestChoice_n = notEntry;
          bestCut_n = &selfCut;
          DEBUG_PRINTLN("\tself-cut {} !p");
        }
        if (minCost_n + notEntry->cost < minCost_p) {
          minCost_p = minCost_n + notEntry->cost;
          bestChoice_p = notEntry;
          bestCut_p = &selfCut;
          DEBUG_PRINTLN("\tself-cut {} !n");
        }

        DEBUG_PRINTLN("\tp-cost = " << minCost_p << " n-cost = " << minCost_n);
        cost_p[mapPoint] = minCost_p;
        cost_n[mapPoint] = minCost_n;
        assert(bestChoice_p);
        choice_p[mapPoint] = bestChoice_p;
        assert(bestChoice_n);
        choice_n[mapPoint] = bestChoice_n;
        // the cuts are copied into the maps, so pointers into the local
        // cut set and to selfCut do not outlive this iteration
        assert(bestCut_p);
        cut_p[mapPoint] = *bestCut_p;
        assert(bestCut_n);
        cut_n[mapPoint] = *bestCut_n;
      }
    }

    // --- PHASE 2 : implement choices

    cout << endl << "\t(ii) implementing best global choices" << endl;
    implementAll(target);

    // PHASE 3 : clean up

    cout << "\t(iii) cleaning up" << endl;
    ModGraph::clearKfeasibleCuts(target);

    // make module structural
    oa::oaModNet *net;
    oa::oaIter<oa::oaModNet> netIter(target->getNets(oacNetIterSingleBit));
    while((net = netIter.getNext())) {
      ModRef termNode = ModGraph::getNetToAiConnection(toBitNet(net));
      if (ModGraph::isTerminal(termNode)) {
        ModGraph::detach(termNode);
      }
    }
    // delete manager if this is the only module
    if (target->getDesign()->getModules().getCount() == 1) {
      Manager::destroy(target->getDesign());
    }

    cout << "\tarea cost = " << totalArea << endl;
}


// *****************************************************************************
// implementAll()
//
/// \brief Implements mapping choices as physical library gates.
///
/// Starts at the outputs and next state inputs and implements all necessary
/// gates to compute those functions.
///
/// \param target
//
// *****************************************************************************
void
Table::implementAll(oa::oaModule *target) {
    assert(target);

    // step 1: break every explicit multi-bit net and terminal into
    // single-bit objects
    oa::oaModNet *net;
    oa::oaIter<oa::oaModNet> busNetIter(target->getNets(oacNetIterNotImplicit));
    while((net = busNetIter.getNext())) {
      assert(net);
      if (net->isImplicit() || net->getNumBits() <= 1) continue;
      net->scalarize();
    }
    oa::oaModTerm *term;
    oa::oaIter<oa::oaModTerm> busTermIter(target->getTerms(oacTermIterNotImplicit));
    while((term = busTermIter.getNext())) {
      assert(term);
      if (term->isImplicit() || term->getNumBits() <= 1) continue;
      term->scalarize();
    }

    // step 2: every existing net whose terminal node has no driver in the
    // graph is recorded as its own implementation
    oa::oaIter<oa::oaModNet> bitNetIter(target->getNets(oacNetIterSingleBit));
    while((net = bitNetIter.getNext())) {
      oa::oaModBitNet *existingNet = toBitNet(net);
      assert(existingNet);
      ModRef netNode = ModGraph::getNetToAiConnection(existingNet);
      if (!ModGraph::isNull(netNode)) {
        assert(ModGraph::isTerminal(netNode));
        if (ModGraph::isNull(ModGraph::getTerminalDriver(netNode))) {
          mapped[netNode] = existingNet;
        }
      }
    }

    // step 3: create an instance (and output net) for every state node before
    // any combinational logic is implemented, so that logic reading a state
    // finds it already mapped
    map<ModRef, oa::oaModInst*> seqInsts;
    list<ModRef> seqNodes;
    ModGraph::getLocalStates(target, seqNodes);
    list<ModRef>::iterator state;
    for(state = seqNodes.begin(); state != seqNodes.end(); state++) {
      seqInsts[*state] = implementSeqNode(*state);
    }

    // step 4: implement the next-state logic of each state node and hook it
    // to the data input of its instance
    for(state = seqNodes.begin(); state != seqNodes.end(); state++) {
      ModRef nextState = ModGraph::getNextState(*state);
      if (ModGraph::isNull(nextState)) continue;

      oa::oaModBitNet *dataNet = implementNode(nextState);
      assert(dataNet);
      oa::oaModInstTerm::create(dataNet, seqInsts[*state], mapUtils.seqInput);
    }

    // step 5: implement the function seen at every single-bit terminal and
    // move the terminal onto the net that now carries it
    oa::oaIter<oa::oaModTerm> bitTermIter(target->getTerms(oacTermIterSingleBit));
    while((term = bitTermIter.getNext())) {
      oa::oaModBitNet *originalNet = toBitNet(term->getNet());
      assert(originalNet);

      ModRef termNode = ModGraph::getNetToAiConnection(originalNet);
      assert(!ModGraph::isNull(termNode));

      oa::oaModBitNet *implementedNet = implementNode(termNode);
      assert(implementedNet);

      term->moveToNet(implementedNet);
    }
}


// *****************************************************************************
// implementSeqNode()
//
/// \brief Maps a sequential node to a library gate.
///
/// This routine only creates the SEQUENTIAL nodes and does not recurse
/// on the next state inputs.
///
/// \param ref
/// \return the module instance of the corresponding sequential cell
// *****************************************************************************
oa::oaModInst *
Table::implementSeqNode(oagFunc::ModRef ref) {
  oa::oaModule *owner = ref.module;
  assert(owner);

  // instantiate the library's sequential cell in the module that owns the node
  oa::oaModScalarInst *seqInst = oa::oaModScalarInst::create(owner, mapUtils.seqGate);

  // attach a new scalar net to the cell's output and record it as the
  // implementation of this node, so later lookups in "mapped" find it
  oa::oaModBitNet *stateNet = oa::oaModScalarNet::create(owner);
  assert(!stateNet->isImplicit());
  oa::oaModInstTerm::create(stateNet, seqInst, mapUtils.seqOutput);
  mapped[ref] = stateNet;

  // hook up the control pins (clocks, resets, etc.); the data input is
  // left for the caller to connect
  mapUtils.connectAllControls(ref, seqInst);

  return seqInst;
}


// *****************************************************************************
// implementNode()
//
/// \brief Maps a combinational node to a library gate.
///
/// This function is recursive.  It will pick the best choice for a particular
/// node and then continue by implementing the best choices for all of the
/// required inputs.
///
/// If the reference is inverted, then the inverted function will be implemented
/// and returned.
///
/// \param ref
/// \return the output module bit net that implements the function
// *****************************************************************************
oa::oaModBitNet *
Table::implementNode(oagFunc::ModRef ref) {
  // Recursively instantiates the library gate chosen for `ref` (the "p"
  // choice for a non-inverted reference, the "n" choice for an inverted
  // one), implements every cut input it requires, and returns the net that
  // carries the resulting function.  Results are memoized in `mapped`.
  // totalArea and totalDelay are updated as a side effect.

  // has this node already been mapped?
  if (mapped.find(ref) != mapped.end()) {
    return mapped[ref];
  }

  DEBUG_PRINTLN("Implementing node " << ref.ref);

  oa::oaModule *currentModule = ref.module;
  assert(currentModule);

  // is this a dangling node?  Either a non-inverted terminal without a
  // driver, or an AND node with both inputs null.
  if ((ModGraph::isTerminal(ref) && !ModGraph::isInverted(ref) &&
       ModGraph::isNull(ModGraph::getTerminalDriver(ref))) ||
      (ModGraph::isAnd(ref) && ModGraph::isNull(ModGraph::getAndRight(ref)) &&
       ModGraph::isNull(ModGraph::getAndLeft(ref)))) {
    // default action: connect to constant zero
    oa::oaModNet *constantNet = oa::oaModNet::find(currentModule,
                                                   oa::oaScalarName(oa::oaNativeNS(), "tie0"));
    assert(constantNet);
    mapped[ref] = toBitNet(constantNet);
    DEBUG_PRINTLN("\tfloating... tying to zero");
    return toBitNet(constantNet);
  }

  // is this a constant node?
  if (ref == ModGraph::constantZero(currentModule)) {
    oa::oaModNet *constantNet = oa::oaModNet::find(currentModule,
                                                   oa::oaScalarName(oa::oaNativeNS(), "tie0"));
    assert(constantNet);
    mapped[ref] = toBitNet(constantNet);
    DEBUG_PRINTLN("\tconstant zero");
    return toBitNet(constantNet);
  } else if (ref == ModGraph::constantOne(currentModule)) {
    oa::oaModNet *constantNet = oa::oaModNet::find(currentModule,
                                                   oa::oaScalarName(oa::oaNativeNS(), "tie1"));
    assert(constantNet);
    mapped[ref] = toBitNet(constantNet);
    DEBUG_PRINTLN("\tconstant one");
    return toBitNet(constantNet);
  }

  // is this an inverted ref? return with "p" or "n" version of node.
  // The choice/cut/cost tables are keyed by the non-inverted reference.
  const ModRef baseRef = ModGraph::getNonInverted(ref);
  TableEntry    *choice;
  ModGraph::Cut *cut;
  if (ModGraph::isInverted(ref)) {
    // "n" mapping
    choice = choice_n[baseRef];
    cut = &(cut_n[baseRef]);
    // totalDelay tracks the maximum cost seen over all implemented nodes
    // (standard replacement for the removed G++ ">?=" extension)
    if (cost_n[baseRef] > totalDelay) {
      totalDelay = cost_n[baseRef];
    }
  } else {
    // "p" mapping
    choice = choice_p[baseRef];
    cut = &(cut_p[baseRef]);
    if (cost_p[baseRef] > totalDelay) {
      totalDelay = cost_p[baseRef];
    }
  }
  if (!choice || !cut) {
    // print whatever identifies the offending node before asserting
    ref.print();
    oa::oaModBitNet *net = ModGraph::getNetToAiConnection(ref);
    if (net) {
      // not every graph node has a net attached; only print when it does
      oa::oaString n;
      net->getName(oa::oaVerilogNS(), n);
      cout << n << endl;
    }
  }
  assert(choice);
  assert(cut);

  // is this a trivial mapping?
  if (choice->directFlag) {
    // return the wire that is connected to the direct input, i.e. the
    // directInput-th element of the cut
    int inputNum = choice->directInput;
    for(ModGraph::Cut::iterator it = cut->begin(); it != cut->end(); ++it) {
      if (!inputNum) {
        assert(*it != ref);
        DEBUG_PRINTLN("\twire");
        return implementNode(*it);
      }
      --inputNum;
    }
    // directInput indexed past the end of the cut
    assert(false);
  }
  if (choice->constantFlag) {
    // return the constant 0 or 1 wire, selected by bit 0 of the function
    oa::oaModNet *constantNet;
    if (choice->func[0] & 0x1) {
      constantNet = oa::oaModNet::find(currentModule, oa::oaScalarName(oa::oaNativeNS(), "tie1"));
    } else {
      constantNet = oa::oaModNet::find(currentModule, oa::oaScalarName(oa::oaNativeNS(), "tie0"));
    }
    assert(constantNet);
    mapped[ref] = toBitNet(constantNet);
    DEBUG_PRINTLN("\tconstant");
    return toBitNet(constantNet);
  }

  // create logic gate
  assert(choice->implementCell);
  oa::oaModScalarInst *inst = oa::oaModScalarInst::create(currentModule, choice->implementCell);
  totalArea += choice->cost;
#if defined(DEBUG)
  oa::oaString gateString;
  choice->implementCell->getCellName(oa::oaVerilogNS(), gateString);
  DEBUG_PRINTLN("\tgate " << gateString);
#endif

  // map inputs: the i-th cut element is connected to the i-th input
  // terminal of the cell, in terminal iteration order
  int num = 0;
  oa::oaModTerm *term, *outputTerm = NULL;
  oa::oaIter<oa::oaModTerm> termIter(choice->implementCell->getTopModule()->getTerms(oacTermIterSingleBit));
  for(ModGraph::Cut::iterator it = cut->begin(); it != cut->end(); ++it, ++num) {
    ModRef cutRef = *it;
    assert(!ModGraph::isNull(cutRef));
    assert(!ModGraph::isInverted(cutRef));

    // is this input inverted in the mapping?
    if ((choice->N_input >> num) & 0x1) {
      cutRef = ModGraph::notOf(cutRef);
    }
    // was an inverter used to map _p <=> _n?
    if (choice == notEntry && cutRef == ref) {
      cutRef = ModGraph::notOf(cutRef);
    }
    // a node must never feed itself, or the recursion would not terminate
    assert(cutRef != ref);

    // get corresponding instTerm: advance to the next input terminal,
    // remembering the output terminal if it is passed on the way
    while((term = termIter.getNext())) {
      if (term->getTermType() == oa::oacInputTermType) {
        break;
      } else if (term->getTermType() == oa::oacOutputTermType) {
        assert(!outputTerm); // can only handle gates with one output
        outputTerm = term;
      }
    }
    assert(term);

    oa::oaModBitNet *mapped_in = implementNode(cutRef);
    assert(mapped_in);
    assert(!mapped_in->isImplicit());
    oa::oaModInstTerm::create(mapped_in, inst, term);
  }

  // find output term (if not already found)
  while(!outputTerm && (term = termIter.getNext())) {
    if (term->getTermType() == oa::oacOutputTermType) {
      outputTerm = term;
    }
  }
  assert(outputTerm);

  // create output net
  oa::oaModBitNet *mapped_out = oa::oaModScalarNet::create(currentModule);
  assert(!mapped_out->isImplicit());
  oa::oaModInstTerm::create(mapped_out, inst, outputTerm);

  mapped[ref] = mapped_out;
  return mapped_out;
}


// *****************************************************************************
// techmapDelay()
//
/// \brief Maps a design to the specified gates to minimize delay.
///
/// All of the functional behavior of the provided design is implemented using
/// structural library components.
///
/// \param target
// *****************************************************************************
void
Table::techmapDelay(oa::oaModule *target) {
    // Delay-oriented technology mapping.  Phase 1 selects, for every node
    // in the fan-in of the outputs and next-state inputs, the cheapest
    // cut/gate pair for both output polarities ("p" = non-inverted,
    // "n" = inverted) using the cumulative delay cost.  Phase 2 instantiates
    // the selected gates.  Phase 3 detaches the functional description so
    // that the module becomes purely structural.
    assert(target);

    cout << "Mapping for delay..." << endl;

    // does the target have any functionality?
    oagFunc::Manager *currentManager = oagFunc::Manager::get(target->getDesign());
    if (currentManager->isStructural()) {
        // the design is already entirely structural
        cerr << "ERROR: Target is already structural or is missing functional "
             << "description.  Aborting." << endl;
        return;
    }

    // has a set of library cells been supplied?
    if (libraryCells.empty()) {
      cerr << "ERROR: Empty combinational library.  Aborting." << endl;
      return;
    }

    // initialize
    ModGraph::clearKfeasibleCuts(target);
    cut_p.clear(); cut_n.clear();
    choice_p.clear(); choice_n.clear();
    cost_p.clear(); cost_n.clear();
    totalArea = totalDelay = 0;

    // check for combinational cycles
    if (ModGraph::hasCombinationalCycle(target)) {
      cerr << "ERROR: Target graph has a combinational cycle" << endl;
      // Manager *m = Manager::get(target);
      // m->getGraph()->print();
      QUIT_ON_ERROR;
    }

    // set cost of all mapping choices to delay
    setDelayCosts();

    // identify not (and corresponding table entry)
    if (!mapUtils.notGate) {
      mapUtils.identifyNot(libraryCells);
    }
    if (!mapUtils.notGate) {
      cerr << "ERROR: Could not identify inverter in cell library" << endl;
      QUIT_ON_ERROR;
    }
    mapUtils.identifyNotTerminals();
    // the inverter is looked up among the single-input table entries
    notEntry = NULL;
    for(unsigned int i = 0; i < tables[1].size(); i++) {
      if (tables[1][i].cell == mapUtils.notGate) {
        notEntry = &(tables[1][i]);
        break;
      }
    }
    assert(notEntry);

    // identify seq gate
    if (!mapUtils.seqGate) {
      cerr << "ERROR : Could not identify sequential gate in cell library" << endl;
      QUIT_ON_ERROR;
    }
    mapUtils.identifySeqTerminalsByName();

    // strip reset nets
    mapUtils.identifyControls(target);
    mapUtils.removeAsyncResetsFromLogic(target);

    // handle constant nodes and nets: if tie0/tie1 nets exist, drive their
    // graph terminals with the corresponding constant
    ModRef constantZero = ModGraph::constantZero(target);
    oa::oaModNet *constantNet = oa::oaModNet::find(target,
                                                   oa::oaScalarName(oa::oaNativeNS(), "tie0"));
    if (constantNet) {
      ModGraph::setTerminalDriver(ModGraph::getNetToAiConnection(toBitNet(constantNet)),
                                  constantZero);
    }
    constantNet = oa::oaModNet::find(target,
                                     oa::oaScalarName(oa::oaNativeNS(), "tie1"));
    if (constantNet) {
      ModGraph::setTerminalDriver(ModGraph::getNetToAiConnection(toBitNet(constantNet)),
                                  ModGraph::constantOne(target));
    }

    // collect the points that must be mapped
    //    1. primary outputs
    //    2. next state inputs of sequential nodes
    list<ModRef> outputsToMap, seqNodes;
    ModGraph::getOutputs(target, outputsToMap);
    ModGraph::getLocalStates(target, seqNodes);
    for(list<ModRef>::iterator it = seqNodes.begin();
        it != seqNodes.end(); ++it) {
      if (!ModGraph::isNull(ModGraph::getNextState(*it))) {
        outputsToMap.push_back(ModGraph::getNextState(*it));
      }
    }
    // collect all fan-ins of these points, followed by the points themselves
    vector<ModRef> nodesToMap;
    ModGraph::getTransitiveFanin(outputsToMap, nodesToMap, true);
    for(list<ModRef>::iterator it = outputsToMap.begin();
        it != outputsToMap.end(); ++it) {
      nodesToMap.push_back(*it);
    }

    // --- PHASE 1 : collect best choice for each node

    cout << "\tnum. nodes to map = " << nodesToMap.size() << endl;
    cout << "\t(i) collecting best choice for each node";
    cout.flush();

    // progress counter (only used when DEBUG is not defined)
    int m = 0;
    static_cast<void>(m);

    // map each point
    for(vector<ModRef>::iterator it = nodesToMap.begin();
        it != nodesToMap.end(); ++it) {
      ModRef mapPoint = ModGraph::getNonInverted(*it);
      // the self-cut {mapPoint} represents "use the other polarity of this
      // node through an inverter"
      ModGraph::Cut selfCut;
      selfCut.insert(selfCut.begin(), mapPoint);

      // progress indicator
#if !defined(DEBUG)
      m++;
      if (m%1000 == 0) { cout << "."; cout.flush(); }
#else
      DEBUG_PRINT("Mapping node ");
      mapPoint.print();
#endif

      if (mapPoint == constantZero) {
        // constants are free in both polarities
        cost_p[mapPoint] = 0;
        cost_n[mapPoint] = 0;
      } else if ((ModGraph::isTerminal(mapPoint) && ModGraph::isNull(ModGraph::getTerminalDriver(mapPoint))) ||
               ModGraph::isSequential(mapPoint)) {
        // undriven terminals and sequential nodes: the positive polarity is
        // free, the negative polarity costs one inverter
        cost_p[mapPoint] = 0;
        cut_n[mapPoint] = selfCut;
        choice_n[mapPoint] = notEntry;
        cost_n[mapPoint] = notEntry->cost;
      } else {
        // explore all possible k-feasible cuts
        // ModGraph::clearKfeasibleCuts(target);
        ModGraph::CutSet cuts = ModGraph::enumerateKfeasibleCuts(mapPoint, MAX_CUT_SIZE, cutsPerNode, -1, true);

        // choose best "p" and "n" cuts and choices
        double minCost_p = DBL_MAX,
          minCost_n = DBL_MAX;
        ModGraph::Cut *bestCut_p = NULL,
          *bestCut_n = NULL;
        TableEntry *bestChoice_p = NULL,
          *bestChoice_n = NULL;

        for(list<ModGraph::Cut>::iterator cut_it = cuts.begin(); cut_it != cuts.end(); ++cut_it) {
          ModGraph::Cut *currentCut = &(*cut_it);
          assert(currentCut);
          int cutWidth = currentCut->size();
          assert(cutWidth <= MAX_CUT_SIZE);

          // ignore self-cut
          if (cutWidth == 1 && *(currentCut->begin()) == mapPoint) continue;

          // construct cut array
          DEBUG_PRINT("\tcut {");
          int p = 0;
          vector<ModRef> cutArray(cutWidth);
          for(set<ModRef>::iterator leaf_it = currentCut->begin(); leaf_it != currentCut->end(); ++leaf_it) {
            DEBUG_PRINTMORE(leaf_it->ref << ", ");
            assert(p < cutWidth);
            cutArray[p++] = *leaf_it;
          }
          DEBUG_PRINTMORE("} ");

          // compute cut function
          TableEntry cutFunction;
          simulate(target->getDesign(), cutFunction, mapPoint, cutArray);
          // compute complement of cut function (flip all 2^cutWidth bits)
          TableEntry complementFunction(cutFunction);
          for(int bit=0; bit<(1<<cutWidth); bit++) {
            complementFunction.setBit(bit, !complementFunction.getBit(bit));
          }

          // match cut function against every library entry of this width
          for(vector<TableEntry>::iterator lib_it = tables[cutWidth].begin();
              lib_it != tables[cutWidth].end(); ++lib_it) {
            TableEntry *currentChoice = &(*lib_it);
            assert(currentChoice);

            // with non-inverted function...
            if (currentChoice->match(cutFunction)) {
              // print name
              if (currentChoice->cell) {
                oa::oaScalarName gateName;
                currentChoice->cell->getCellName(gateName);
                oa::oaString gateString;
                gateName.get(gateString);
                DEBUG_PRINTMORE(" " << gateString);
              } else if (currentChoice->directFlag) {
                DEBUG_PRINTMORE(" WIRE");
              } else if (currentChoice->constantFlag) {
                DEBUG_PRINTMORE(" CONST");
              }
              // keep best match; both polarities are ranked by the same
              // (delay) objective
              const double candidateCost = getCumulativeDelayCost(currentCut, currentChoice);
              if (candidateCost < minCost_p) {
                minCost_p = candidateCost;
                bestChoice_p = currentChoice;
                bestCut_p = currentCut;
              }
            }

            // with inverted function...
            if (currentChoice->match(complementFunction)) {
              // print name
              if (currentChoice->cell) {
                oa::oaScalarName gateName;
                currentChoice->cell->getCellName(gateName);
                oa::oaString gateString;
                gateName.get(gateString);
                DEBUG_PRINTMORE(" !" << gateString);
              } else if (currentChoice->directFlag) {
                DEBUG_PRINTMORE(" !WIRE");
              } else if (currentChoice->constantFlag) {
                DEBUG_PRINTMORE(" !CONST");
              }
              // keep best match
              const double candidateCost = getCumulativeDelayCost(currentCut, currentChoice);
              if (candidateCost < minCost_n) {
                minCost_n = candidateCost;
                bestChoice_n = currentChoice;
                bestCut_n = currentCut;
              }
            }
          }

          DEBUG_PRINTMORE(endl);
        }

        // lastly, check self-cuts (i.e. just using an inverter)
        if (minCost_p + notEntry->cost < minCost_n) {
          minCost_n = minCost_p + notEntry->cost;
          bestChoice_n = notEntry;
          bestCut_n = &selfCut;
          DEBUG_PRINTLN("\tself-cut {} !p");
        }
        if (minCost_n + notEntry->cost < minCost_p) {
          minCost_p = minCost_n + notEntry->cost;
          bestChoice_p = notEntry;
          bestCut_p = &selfCut;
          DEBUG_PRINTLN("\tself-cut {} !n");
        }

        // could we not find a mapping?
        if (!bestChoice_p || !bestChoice_n) {
          cerr << endl << "ERROR: Could not find a mapping for ";
          mapPoint.print();
          cerr << "       This shouldn't happen.  Perhaps the cut count/depth is too small?" << endl;
          QUIT_ON_ERROR;
        }

        // record the winners; the cuts are copied because `cuts` and
        // `selfCut` are local to this iteration
        DEBUG_PRINTLN("\tp-cost = " << minCost_p << " n-cost = " << minCost_n);
        cost_p[mapPoint] = minCost_p;
        cost_n[mapPoint] = minCost_n;
        assert(bestChoice_p);
        choice_p[mapPoint] = bestChoice_p;
        assert(bestChoice_n);
        choice_n[mapPoint] = bestChoice_n;
        assert(bestCut_p);
        cut_p[mapPoint] = *bestCut_p;
        assert(bestCut_n);
        cut_n[mapPoint] = *bestCut_n;
      }
    }

    // --- PHASE 2 : implement choices

    cout << endl << "\t(ii) implementing best global choices" << endl;
    implementAll(target);

    // PHASE 3 : clean up

    cout << "\t(iii) cleaning up" << endl;
    ModGraph::clearKfeasibleCuts(target);

    // make module structural: detach every net's terminal node from the
    // functional graph
    oa::oaModNet *net;
    oa::oaIter<oa::oaModNet> netIter(target->getNets(oacNetIterSingleBit));
    while((net = netIter.getNext())) {
      ModRef termNode = ModGraph::getNetToAiConnection(toBitNet(net));
      if (ModGraph::isTerminal(termNode)) {
        ModGraph::detach(termNode);
      }
    }
    // delete manager if this is the only module
    if (target->getDesign()->getModules().getCount() == 1) {
      Manager::destroy(target->getDesign());
    }

    cout << "\tdelay cost = " << totalDelay << endl;
}

}
