Source code for xdev.algo

from __future__ import absolute_import, division, print_function, unicode_literals
import numpy as np
import decimal
from collections import defaultdict
import ubelt as ub



[docs]
def edit_distance(string1, string2):
    """
    Edit distance algorithm. String1 and string2 can be either
    strings or lists of strings

    Args:
        string1 (str | List[str]):
        string2 (str | List[str]):

    Requirements:
        pip install python-Levenshtein

    Returns:
        float | List[float] | List[List[float]]

    Example:
        >>> # xdoctest: +REQUIRES(module:Levenshtein)
        >>> string1 = 'hello world'
        >>> string2 = ['goodbye world', 'rofl', 'hello', 'world', 'lowo']
        >>> edit_distance(['hello', 'one'], ['goodbye', 'two'])
        >>> edit_distance('hello', ['goodbye', 'two'])
        >>> edit_distance(['hello', 'one'], 'goodbye')
        >>> edit_distance('hello', 'goodbye')
        >>> distmat = edit_distance(string1, string2)
        >>> result = ('distmat = %s' % (ub.repr2(distmat),))
        >>> print(result)
        >>> [7, 9, 6, 6, 7]
    """

    import Levenshtein
    isiter1 = ub.iterable(string1)
    isiter2 = ub.iterable(string2)
    strs1 = string1 if isiter1 else [string1]
    strs2 = string2 if isiter2 else [string2]
    distmat = [
        [Levenshtein.distance(str1, str2) for str2 in strs2]
        for str1 in strs1
    ]
    # broadcast
    if not isiter2:
        distmat = [row[0] for row in distmat]
    if not isiter1:
        distmat = distmat[0]
    return distmat




[docs]
def knapsack(items, maxweight, method='iterative'):
    r"""
    Solve the knapsack problem by finding the most valuable subsequence of
    `items` subject that weighs no more than `maxweight`.

    Args:
        items (tuple): is a sequence of tuples `(value, weight, id_)`, where
            `value` is a number and `weight` is a non-negative integer, and
            `id_` is an item identifier.

        maxweight (numbers.Real):
            a non-negative number indicating the maximum weight of items we can
            take.

    Returns:
        tuple: (total_value, items_subset) - a pair whose first element is the
            sum of values in the most valuable subsequence, and whose second
            element is the subsequence. Subset may be different depending on
            implementation (ie top-odwn recusrive vs bottom-up iterative)

    References:
        http://codereview.stackexchange.com/questions/20569/dynamic-programming-solution-to-knapsack-problem
        http://stackoverflow.com/questions/141779/solving-the-np-complete-problem-in-xkcd
        http://www.es.ele.tue.nl/education/5MC10/Solutions/knapsack.pdf

    Example:
        >>> # ENABLE_DOCTEST
        >>> import ubelt as ub
        >>> items = [(4, 12, 0), (2, 1, 1), (6, 4, 2), (1, 1, 3), (2, 2, 4)]
        >>> maxweight = 15
        >>> total_value1, items_subset1 = knapsack(items, maxweight, method='iterative')
        >>> print('items_subset1 = {}'.format(ub.repr2(items_subset1, nl=1)))
        >>> print('total_value1 = {}'.format(ub.repr2(total_value1, nl=1)))

    Example:
        >>> # ENABLE_DOCTEST
        >>> # Solve https://xkcd.com/287/
        >>> weights = [2.15, 2.75, 3.35, 3.55, 4.2, 5.8] * 2
        >>> items = [(w, w, i) for i, w in enumerate(weights)]
        >>> maxweight = 15.05
        >>> total_value1, items_subset1 = knapsack(items, maxweight, method='iterative')
        >>> total_weight = sum([t[1] for t in items_subset1])
        >>> print('total_weight = %r' % (total_weight,))
        >>> print('items_subset1 = %r' % (items_subset1,))
        >>> #assert items_subset1 == items_subset, 'NOT EQ\n%r !=\n%r' % (items_subset1, items_subset)

    Timeit:
        # >>> import ubelt as ub
        # >>> setup = ub.codeblock(
        # >>>     '''
        #         import ubelt as ut
        #         weights = [215, 275, 335, 355, 42, 58] * 40
        #         items = [(w, w, i) for i, w in enumerate(weights)]
        #         maxweight = 2505
        #         #import numba
        #         #knapsack_numba = numba.autojit(knapsack_iterative)
        #         #knapsack_numba = numba.autojit(knapsack_iterative_numpy)
        #         ''')
        # >>> # Test load time
        # >>> stmt_list1 = ub.codeblock(
        # >>>     '''
        #         knapsack_iterative(items, maxweight)
        #         knapsack_ilp(items, maxweight)
        #         #knapsack_numba(items, maxweight)
        #         #knapsack_iterative_numpy(items, maxweight)
        #         ''').split('\n')

        # ut.util_dev.timeit_compare(stmt_list1, setup, int(5))

        from xdev.algo import *  # NOQA
        def knapsack_inputs(n):
            rng = np.random
            items = [
                (rng.randint(0, 10), rng.randint(0, 10), idx) for idx in range(n)]
            return {'items':  items, 'maxweight': 100}

        def input_wrapper(func):
            import functools
            @functools.wraps(func)
            def _wrap(kwargs):
                return func(**kwargs)
            return _wrap

        import perfplot
        perfplot.show(
            setup=knapsack_inputs,
            kernels=list(map(input_wrapper, [knapsack_iterative, knapsack_ilp])),
            n_range=[2 ** k for k in range(15)],
            xlabel="maxweight",
            equality_check=None,
        )

    """
    if method == 'iterative':
        return knapsack_iterative(items, maxweight)
    elif method == 'ilp':
        return knapsack_ilp(items, maxweight)
    else:
        raise NotImplementedError('[util_alg] knapsack method=%r' % (method,))

        #return knapsack_iterative_numpy(items, maxweight)



[docs]
def knapsack_ilp(items, maxweight, verbose=False):
    """
    solves knapsack using an integer linear program

    Example:
        >>> # DISABLE_DOCTEST
        >>> import ubelt as ub
        >>> # Solve https://xkcd.com/287/
        >>> weights = [2.15, 2.75, 3.35, 3.55, 4.2, 5.8, 6.55]
        >>> values  = [2.15, 2.75, 3.35, 3.55, 4.2, 5.8, 6.55]
        >>> indices = ['mixed fruit', 'french fries', 'side salad',
        >>>            'hot wings', 'mozzarella sticks', 'sampler plate',
        >>>            'barbecue']
        >>> items = [(v, w, i) for v, w, i in zip(values, weights, indices)]
        >>> #items += [(3.95, 3.95, 'mystery plate')]
        >>> maxweight = 15.05
        >>> verbose = True
        >>> total_value, items_subset = knapsack_ilp(items, maxweight, verbose)
        >>> print('items_subset = %s' % (ub.repr2(items_subset, nl=1),))
    """
    import pulp
    # Given Input
    values  = [t[0] for t in items]
    weights = [t[1] for t in items]
    indices = [t[2] for t in items]
    # Formulate integer program
    prob = pulp.LpProblem("Knapsack", pulp.LpMaximize)
    # Solution variables
    x = pulp.LpVariable.dicts(name='x', indexs=indices,
                              lowBound=0, upBound=1, cat=pulp.LpInteger)
    # maximize objective function
    prob.objective = sum(v * x[i] for v, i in zip(values, indices))
    # subject to
    prob.add(sum(w * x[i] for w, i in zip(weights, indices)) <= maxweight)
    # Solve using with solver like CPLEX, GLPK, or SCIP.
    #pulp.CPLEX().solve(prob)
    pulp.PULP_CBC_CMD().solve(prob)
    # Read solution
    flags = [x[i].varValue for i in indices]
    total_value = sum([val for val, flag in zip(values, flags) if flag])
    items_subset = [item for item, flag in zip(items, flags) if flag]
    # Print summary
    if verbose:
        print(prob)
        print('OPT:')
        print('\n'.join(['    %s = %s' % (x[i].name, x[i].varValue) for i in indices]))
        print('total_value = %r' % (total_value,))
    return total_value, items_subset




[docs]
def number_of_decimals(num):
    r"""
    Args:
        num (float):

    References:
        stackoverflow.com/questions/6189956/finding-decimal-places

    Example:
        >>> # ENABLE_DOCTEST
        >>> num = 15.05
        >>> result = number_of_decimals(num)
        >>> print(result)
        2
    """
    exp = decimal.Decimal(str(num)).as_tuple().exponent
    return max(0, -exp)




[docs]
def knapsack_iterative(items, maxweight):
    # Knapsack requires integral weights
    weights = [t[1] for t in items]
    max_exp = max([number_of_decimals(w_) for w_ in weights])
    coeff = 10 ** max_exp
    # Adjust weights to be integral
    int_maxweight = int(maxweight * coeff)
    int_items = [(v, int(w * coeff), idx) for v, w, idx in items]
    """
    items = int_items
    maxweight = int_maxweight
    """
    return knapsack_iterative_int(int_items, int_maxweight)




[docs]
def knapsack_iterative_int(items, maxweight):
    r"""
    Iterative knapsack method

    Math:
        maximize \sum_{i \in T} v_i
        subject to \sum_{i \in T} w_i \leq W

    Notes:
        dpmat is the dynamic programming memoization matrix.
        dpmat[i, w] is the total value of the items with weight at most W
        T is idx_subset, the set of indicies in the optimal solution

    Example:
        >>> # ENABLE_DOCTEST
        >>> weights = [1, 3, 3, 5, 2, 1] * 2
        >>> items = [(w, w, i) for i, w in enumerate(weights)]
        >>> maxweight = 10
        >>> items = [(.8, 700, 0)]
        >>> maxweight = 2000
        >>> print('maxweight = %r' % (maxweight,))
        >>> print('items = %r' % (items,))
        >>> total_value, items_subset = knapsack_iterative_int(items, maxweight)
        >>> total_weight = sum([t[1] for t in items_subset])
        >>> print('total_weight = %r' % (total_weight,))
        >>> print('items_subset = %r' % (items_subset,))
        >>> result =  'total_value = %.2f' % (total_value,)
        >>> print(result)
        total_value = 0.80

    Ignore:
        DPMAT = [[dpmat[r][c] for c in range(maxweight)] for r in range(len(items))]
        KMAT  = [[kmat[r][c] for c in range(maxweight)] for r in range(len(items))]
    """
    values  = [t[0] for t in items]
    weights = [t[1] for t in items]
    maxsize = maxweight + 1
    # Sparse representation seems better
    dpmat = defaultdict(lambda: defaultdict(lambda: np.inf))
    kmat = defaultdict(lambda: defaultdict(lambda: False))
    idx_subset = []  # NOQA
    for w in range(maxsize):
        dpmat[0][w] = 0
    # For each item consider to include it or not
    for idx in range(len(items)):
        item_val = values[idx]
        item_weight = weights[idx]
        # consider at each possible bag size
        for w in range(maxsize):
            valid_item = item_weight <= w
            if idx > 0:
                prev_val = dpmat[idx - 1][w]
                prev_noitem_val = dpmat[idx - 1][w - item_weight]
            else:
                prev_val = 0
                prev_noitem_val = 0
            withitem_val = item_val + prev_noitem_val
            more_valuable = withitem_val > prev_val
            if valid_item and more_valuable:
                dpmat[idx][w] = withitem_val
                kmat[idx][w] = True
            else:
                dpmat[idx][w] = prev_val
                kmat[idx][w] = False
    # Trace backwards to get the items used in the solution
    K = maxweight
    for idx in reversed(range(len(items))):
        if kmat[idx][K]:
            idx_subset.append(idx)
            K = K - weights[idx]
    idx_subset = sorted(idx_subset)
    items_subset = [items[i] for i in idx_subset]
    total_value = dpmat[len(items) - 1][maxweight]
    return total_value, items_subset




[docs]
def knapsack_iterative_numpy(items, maxweight):
    r"""
    Iterative knapsack method

    maximize \sum_{i \in T} v_i
    subject to \sum_{i \in T} w_i \leq W

    Notes:
        dpmat is the dynamic programming memoization matrix.
        dpmat[i, w] is the total value of the items with weight at most W
        T is the set of indicies in the optimal solution
    """
    #import numpy as np
    items = np.array(items)
    weights = items.T[1]
    # Find maximum decimal place (this problem is in NP)
    max_exp = max([number_of_decimals(w_) for w_ in weights])
    coeff = 10 ** max_exp
    # Adjust weights to be integral
    weights = (weights * coeff).astype(np.int)
    values  = items.T[0]
    MAXWEIGHT = int(maxweight * coeff)
    W_SIZE = MAXWEIGHT + 1

    dpmat = np.full((len(items), W_SIZE), np.inf)
    kmat = np.full((len(items), W_SIZE), 0, dtype=np.bool)
    idx_subset = []

    for w in range(W_SIZE):
        dpmat[0][w] = 0
    for idx in range(1, len(items)):
        item_val = values[idx]
        item_weight = weights[idx]
        for w in range(W_SIZE):
            valid_item = item_weight <= w
            prev_val = dpmat[idx - 1][w]
            if valid_item:
                prev_noitem_val = dpmat[idx - 1][w - item_weight]
                withitem_val = item_val + prev_noitem_val
                more_valuable = withitem_val > prev_val
            else:
                more_valuable = False
            dpmat[idx][w] = withitem_val if more_valuable else prev_val
            kmat[idx][w] = more_valuable
    K = MAXWEIGHT
    for idx in reversed(range(1, len(items))):
        if kmat[idx, K]:
            idx_subset.append(idx)
            K = K - weights[idx]
    idx_subset = sorted(idx_subset)
    items_subset = [items[i] for i in idx_subset]
    total_value = dpmat[len(items) - 1][MAXWEIGHT]
    return total_value, items_subset



#def knapsack_all_solns(items, maxweight):
#    """
#    TODO: return all optimal solutions to the knapsack problem

#    References:
#        stackoverflow.com/questions/30554290/all-solutions-from-knapsack-dp-matrix

#    >>> items = [(1, 2, 0), (1, 3, 1), (1, 4, 2), (1, 3, 3), (1, 3, 4), (1, 5, 5), (1, 4, 6), (1, 1, 7), (1, 1, 8), (1, 3, 9)]
#    >>> weights = ut.get_list_column(items, 1)
#    >>> maxweight = 6
#    """



[docs]
def knapsack_greedy(items, maxweight):
    r"""
    non-optimal greedy version of knapsack algorithm
    does not sort input. Sort the input by largest value
    first if desired.

    Args:
        `items` (tuple): is a sequence of tuples `(value, weight, id_)`, where `value`
            is a scalar and `weight` is a non-negative integer, and `id_` is an
            item identifier.

        `maxweight` (scalar):  is a non-negative integer.

    Example:
        >>> # ENABLE_DOCTEST
        >>> items = [(4, 12, 0), (2, 1, 1), (6, 4, 2), (1, 1, 3), (2, 2, 4)]
        >>> maxweight = 15
        >>> total_value, items_subset = knapsack_greedy(items, maxweight)
        >>> result =  'total_value = %r\n' % (total_value,)
        >>> result += 'items_subset = %r' % (items_subset,)
        >>> print(result)
        total_value = 7
        items_subset = [(4, 12, 0), (2, 1, 1), (1, 1, 3)]
    """
    items_subset = []
    total_weight = 0
    total_value = 0
    for item in items:
        value, weight = item[0:2]
        if total_weight + weight > maxweight:
            continue
        else:
            items_subset.append(item)
            total_weight += weight
            total_value += value
    return total_value, items_subset



if __name__ == '__main__':
    """
    CommandLine:
        python ~/code/xdev/xdev/algo.py all
    """
    import xdoctest
    xdoctest.doctest_module(__file__)