Programming Forums
User Name Password Register
 

RSS Feed
FORUM INDEX | TODAY'S POSTS | UNANSWERED THREADS | ADVANCED SEARCH

Reply
 
Thread Tools Display Modes
Old Jun 11th, 2005, 11:32 PM   #1
EverLearning
Hobbyist Programmer
 
EverLearning's Avatar
 
Join Date: May 2005
Location: Indiana
Posts: 130
Rep Power: 4 EverLearning is on a distinguished road
interpolation search on a million

Quote:
Originally Posted by DaWei
if you'd like to start another thread regarding the interpolative search, I have a function that works. I want to add to it so it compares binary vs interpolative for various dataset distributions. We should dedicate this one to fcp's game/original question.
Agree, don't want to confuse the OP :o

"results" in trying to get this search to work for 1,000,000:

1. my compiler flipped out flag errors on me
2. changing relevant variables to long ints did not work for me: the formula for estimate still would give a large negative number (actually I had a small data-type-math-test program)
3. changing relevant variables/casting to doubles (and everything else that goes with it: floor and fmod) resolved negative values in the formula and got it to work!

DaWei, how did you go about it?

This is actually unclear:
Quote:
Originally Posted by DaWei
If you're on a 32-bit machine a signed integer should handle a couple billion. RAND_MAX is 32767. By multiplying two of those you can get into the 32-bit range, as Dragon showed.
rand()*rand() gives a huge number well over 32768, and division of a number < 32768 produces some negative rubbish...

to Ancient Dragon: thanks for pointing out errors, so here it is ... run it and tell me that it's not working and i'll shred my class-notes, hmm... i guess they may have a point not wanting to hire people straight out of school.

#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>

void random_vector (int n, double m, double a[n])
{
   int i = 0;
   for(i = 0; i < n; i++)
   { 
     double random = rand();
     a[i] = floor(fmod(random, m));
   
   }
}

void print_vector (int n, double a[n])
{
   int i;
   for(i = 0; i < n; i++) 
	   printf(" %g", a[i]);
   printf("last is %g\n", a[n-1]);
   printf("\n");
}

int compare (const void *e1, const void *e2)
{
   double *i1 = (double*)e1;
   double *i2 = (double*)e2;
   return ((*i1 < *i2) ? -1 : (*i1 > *i2) ? +1 : 0);
}

int search (double a[], int lb, int ub, int key, int *num_tries)
{
   if (lb > ub)
   {
      return -1;
   }
   else if (lb == ub)
   {
      return ( a[lb] == key ? lb : -1 );
   }
   else
   {
      double z = (double)lb + (double)(ub-lb)*((double)key-a[lb])/(a[ub]-a[lb]);
      printf("z is %g\n", z);
      int k = (int)floor(z);

      printf("estimate now is %d\n", k);
      
      if ((k < lb) || (k > ub))
      { 
          return -1;
      }
      if (a[k] == key)
      {
         return k;
      }
      else if (a[k] > key)
      {
         *num_tries = *num_tries + 1;
         return search(a, lb, k-1, key, num_tries);
      }
      else
      {
         *num_tries = *num_tries + 1;
         return search(a, k+1, ub, key, num_tries);
      }
   }
}  


void test_search (int n, double a[n])
{
   int answer = 0;
   do
   {
      int num_tries = 0;
      int key = 0;
      int k = 0;
      printf("Give a key : "); 
      scanf("%d", &key);
      k = search(a, 0, n-1, key, &num_tries);
     
      if (k < 0)
      {
         printf("not found.\n");
      }
      else
      {
         if(num_tries == 0)
		   num_tries++;
         printf("found at position %d\n", k);
         printf("tries = %d\n", num_tries);
      }
      printf("Do you want more tests ? (1 for yes, 0 for no) ");
      scanf("%d", &answer);
   } while (answer == 1);
}

int main (void)
{
   int s = 0;
   int m = 0;

   srand(time(NULL));

   printf("Give size of array to sort: ");
   scanf("%d", &s);
   printf("Give range of numbers in the array: ");
   scanf("%d", &m);

   double a[s];
   random_vector(s, (double)m, a);
   //print_vector(s, a);
   qsort((void*)a,(size_t)s, sizeof(double), compare);
   printf("\n");
   printf("********SORTED*********\n");
   //print_vector(s, a);
   printf("last is %g\n", a[s-1]);
   test_search(s, a);

   return 0;
}
__________________
got math? yumm...

"All men by nature desire to know" -Aristotle, Metaphysics

Last edited by EverLearning; Jun 12th, 2005 at 6:37 PM. Reason: forgot one cast; correcting error mentioned in post #3
EverLearning is offline   Reply With Quote
Old Jun 12th, 2005, 8:06 AM   #2
Ancient Dragon
PFO God In Training
 
Ancient Dragon's Avatar
 
Join Date: Jun 2005
Location: near St Louis, MO. (USA)
Posts: 499
Rep Power: 4 Ancient Dragon is on a distinguished road
That seemed to work for all the values I could throw at it. using rand() by itself doesn't seem to produce large enough values, so I changed it to rand() * rand(), as in my previous tests.

I think you have the num_tries counter in the wrong place -- it doesn't count the number of tries correctly. In the code below I also typecast the array a to int when making the comparisons to insure integer test rather than double test.

      if ((k < lb) || (k > ub))
      { 
          return -1;
      }
      (*num_tries)++;
      if ((int)(a[k]) == key)
      {
         return k;
      }
      else if ((int)(a[k]) > key)
      {
//         *num_tries = *num_tries + 1;
         return search(a, lb, k-1, key, num_tries);
      }
      else
      {
//         *num_tries = *num_tries + 1;
         return search(a, k+1, ub, key, num_tries);
      }

Last edited by Ancient Dragon; Jun 12th, 2005 at 8:09 AM.
Ancient Dragon is offline   Reply With Quote
Old Jun 12th, 2005, 3:24 PM   #3
EverLearning
Hobbyist Programmer
 
EverLearning's Avatar
 
Join Date: May 2005
Location: Indiana
Posts: 130
Rep Power: 4 EverLearning is on a distinguished road
Yeah... if i would've corrected in posted code this line, which did not make any sense now that i look at it:
      else
      {
         if(num_tries == 0) //not (k == 0) 
		   num_tries++;
         printf("found at position %d\n", k);
         printf("tries = %d\n", num_tries);
      }
it would've been same thing, but it's a round-about way of doing it. My bet ... missed it.

Compiling with -Wall did not detect the incompatible types comparison error, but you're right again.
__________________
got math? yumm...

"All men by nature desire to know" -Aristotle, Metaphysics

Last edited by EverLearning; Jun 12th, 2005 at 3:35 PM.
EverLearning is offline   Reply With Quote
Old Jun 12th, 2005, 5:45 PM   #4
Ancient Dragon
PFO God In Training
 
Ancient Dragon's Avatar
 
Join Date: Jun 2005
Location: near St Louis, MO. (USA)
Posts: 499
Rep Power: 4 Ancient Dragon is on a distinguished road
I believe the original statement that started all this was that iterative search was faster than binary search. For doubles, I agree. But for other data types the iterative search algorithm you have shown so far fails to work at all, let alone faster. DaWel said he has one that works, and I'd like to see it. But so far, binary search raigns supreme.
Ancient Dragon is offline   Reply With Quote
Old Jun 12th, 2005, 6:49 PM   #5
EverLearning
Hobbyist Programmer
 
EverLearning's Avatar
 
Join Date: May 2005
Location: Indiana
Posts: 130
Rep Power: 4 EverLearning is on a distinguished road
This is a recursive version so I'm not sure it's necessarily faster computation-wise, even if it is non-recursive it is not all that much faster than BS, but in terms of number of comparisons it supposed to have fewer # of comparisons than binary given data is uniformely distributed and large, it supposed to approximate to a straight line if graphed in size vs. array[size] plane.

It had fewer # of tries than BS on ints on 5500 range.
My implementation may suck, but sorry, I did not invent the whole idea of this search and do not find it erroneous given proper conditions.
I'll test it out for other data-types, it should not be necessarily dependent on the data-type but how it's distributed. It's worst case is O(n) comparisons, which is roughly the size of data.

I also would like to see DaWei's.
__________________
got math? yumm...

"All men by nature desire to know" -Aristotle, Metaphysics
EverLearning is offline   Reply With Quote
Old Jun 12th, 2005, 7:20 PM   #6
DaWei
Resident Grouch
 
DaWei's Avatar
 
Join Date: Jun 2005
Posts: 6,453
Rep Power: 10 DaWei is on a distinguished road
If one has a data set that can be described (or closely approximated) by a function, then an interpolative search using that function should be very effective, since the initial "guess" zeroes in rapidly regardless of where the "needle" is in the "haystack." A binary search repeatedly tosses away half the set on each iteration, which means if the "needle" is near the extremes, the search will take longer than if it's near the center (although it's still very effective).

This rough little program deals with a linear data set. The same gains should hold true for any data set which can be approximated by a function, e.g. exponential, or whatever.

There are three functions available for filling the set. They are not operator selectable, though that could easily be added. I just alter the source code and go. One function, "linearFill," results in an essentially perfect linearity. A second function, "perturbedFill," perturbs the perfect value by a random amount. The amount of the variance is such that the value for any given element will not "encroach" on the range of values for either neighboring element. The third function, "randomFill," fills the vector with random numbers in the range zero to maxValue, then sorts the vector.

The maximum value is currently set to one million. The size of the vector is set to 70 elements strictly for plotting purposes. After the vector is filled the numerical values are displayed, then plotted. The plot is 70 units wide. The range of values is scaled to fit in 25 lines.

I can imagine circumstance where something like measurement samples would result in a more or less predictable data set. I can't imagine that one would search the set for a precise value. More likely, one would search for a value that fell within some tolerance range. Consequently, I have rewritten my first shot to allow a tolerance to be specified for the "needle."

After the set values are displayed and plotted the user is asked for a "needle" value and a tolerance. An interpolative search is made, followed by a binary search. The results are displayed, including the number of tries required to either find the value (within the tolerance spec) or determine that it is not within the set.

No time was spent in minimizing the code or in organizing it into separate source/header files. I have played with it a small amount. It behaves as I expect. It might be riddled with gotchas and glitches. They would have surfaced by now, but it's waiting for my boss to come look over my shoulder before it crashes. I have been concealing from my system the fact that I no longer have a boss, so it doesn't know not to wait for an opportune time.
#include <iostream>
#include <vector>
#include <algorithm>
#include <iomanip>
#include <string>
#include <cmath>
#include <ctime>

using namespace std;

typedef vector <int> dataSet;
const unsigned setSize = 70;
const unsigned maxValue = 1000000;

int  compare (int a, int b, int tol);
void showData (dataSet& set, unsigned size);
void plotData (dataSet& set, unsigned size);
void showResults (int idx, unsigned trials);
void linearFill (dataSet& set, unsigned max, unsigned size);
void perturbedFill (dataSet& set, unsigned max, unsigned size);
void randomFill (dataSet& set, unsigned max, unsigned size);
unsigned interpSearch (dataSet& theSet, unsigned low, unsigned high, int theVal, int tol, unsigned *tries);
unsigned binarySearch (dataSet& theSet, unsigned low, unsigned high, int theVal, int tol, unsigned *tries);

int uhOh (string badNews)
{
    cerr << badNews << endl;
    return 1;
}
int main (int argc, char *argv [])
{
    dataSet theSet;
    unsigned tries;
    int theVal, index, tolerance;
    unsigned maxRand = (unsigned) sqrt ((double) maxValue);
    srand ((unsigned) time (0));
    while (true)
    {
        // Generate dataset
        randomFill (theSet, maxValue, setSize);
        showData (theSet, setSize);
        plotData (theSet, setSize);

        cout << "Enter a positive integer in the range 0-" << (maxValue-1) << ": ";
        cin.sync ();
        cin >> theVal;
        if ((theVal < 0) || (theVal >= maxValue) || !cin.good ())
            return uhOh ("Aborting, pay attention next time");
        cout << "Enter a tolerance value in the range 0-" << (maxValue/setSize) << ": ";
        cin.sync ();
        cin >> tolerance;
        if ((tolerance < 0) || (tolerance > (maxValue/setSize)) || !cin.good ())
            return uhOh ("Aborting, invalid input");
        tries = 0;
        index = interpSearch (theSet, 0, setSize-1, theVal, tolerance, &tries);
        cout << "Interpolative search: ";
        showResults (index, tries);
        tries = 0;
        index = binarySearch (theSet, 0, setSize-1, theVal, tolerance, &tries);
        cout << "Binary search: ";
        showResults (index, tries);
        cout << "Press ENTER for next iteration\n";
        cin.sync ();
        cin.get ();
    }
	return 0;
}
unsigned interpSearch (dataSet& theSet, unsigned low, unsigned high, 
                       int theVal, int tol, unsigned *pTries)
{
    unsigned divs, theSlot;

    (*pTries)++;
    if ((theVal < (theSet [low] - tol)) || (theVal > (theSet [high] + tol))) return -1;

    if (low >= high)
    {
        if (compare (theSet [low], theVal, tol) == 0) return low;
        else return -1;
    }
    divs = high-low;
    theSlot = (theVal - theSet [low]) * divs / (theSet [high] - theSet [low]) + low;

    if (compare (theSet [theSlot], theVal, tol) == 0) return theSlot;
    else if (compare (theSet [theSlot], theVal, tol) > 0) 
            return interpSearch (theSet, low, theSlot-1, theVal, tol, pTries);
    else    return interpSearch (theSet, theSlot+1, high, theVal, tol, pTries);
}
unsigned binarySearch (dataSet& theSet, unsigned low, unsigned high, 
                       int theVal, int tol, unsigned *pTries)
{
    unsigned theSlot;
    (*pTries)++;
    if ((compare (theVal, theSet [low],  tol) < 0) || 
        (compare (theVal, theSet [high], tol) > 0)) return -1;

    if (low >= high)
    {
        if (compare (theSet [low], theVal, tol) == 0) return low;
        else return -1;
    }
    theSlot = (high-low)/2 + low;
    if (compare (theSet [theSlot], theVal, tol) == 0) return theSlot;
    else if (compare (theSet [theSlot], theVal, tol) > 0) 
            return binarySearch (theSet, low, theSlot-1, theVal, tol, pTries);
    else    return binarySearch (theSet, theSlot+1, high, theVal, tol, pTries);
}
void showData (dataSet& set, unsigned size)
{    
    cout << "The set:\n";
    unsigned nLines = size / 10;
    if (size % 10) nLines++;
    for (unsigned i = 0; i < nLines; ++i)
    {
        for (unsigned j = 0; j < 10; ++j)
        {
            cout << setw (6) << set [i*10+j] << ' ';
        }
        cout << endl;
    }
    cout << endl;
}
void plotData (dataSet& set, unsigned size)
{
    int min = set [0];
    int max = set [size-1];
    int incr = max / 25;
    for (int i = 24; i >= 0; --i)
    {
        cout << '|';
        for (int j = 0; j < 70; ++j)
        {
            if ((set [j] < i*incr) && (set [j] > i*incr-incr)) cout << 'x'; else cout << (char) ' ';
        }
        cout << endl;
    }
    cout << "|----------------------------------------------------------------------";
    cout << endl;
}
void showResults (int idx, unsigned trials)
{
    string sTries = " tries";
    if (trials == 1) sTries = " try";
    if (idx >= 0 ) cout << "Found in element " << idx
                          << " with " << trials;
    else cout << "Determined not to be in set after " << trials;
    cout << sTries << endl;
}
void linearFill (dataSet& set, unsigned max, unsigned size)
{
    unsigned incr = max / (size-1);
    for (unsigned i = 0; i < size; ++i) set.push_back (i * incr);
}
void perturbedFill (dataSet& set, unsigned max, unsigned size)
{
    unsigned incr = max / (size-1);
    for (unsigned i = 0; i < size; ++i) set.push_back (i * incr + (rand () % incr));
}
void randomFill (dataSet& set, unsigned max, unsigned size)
{
    for (unsigned i = 0; i < size; ++i) set.push_back ((rand () * rand ()) % max);
    sort (set.begin (), set.end ());
}
int  compare (int a, int b, int tol)
{
    if ((a - b + tol) < 0) return -1;
    if ((a - b - tol) > 0) return 1;
    return 0;
}
__________________
Abstraction doesn't make it impossible to write bad code; it makes it possible to write superior code.
Contributor's Corner: Grumpy on C++ Exceptions DaWei on Pointers
DaWei is offline   Reply With Quote
Old Jun 13th, 2005, 7:40 AM   #7
nnxion
Programming Guru
 
nnxion's Avatar
 
Join Date: Jun 2005
Location: elemental plane
Posts: 1,429
Rep Power: 5 nnxion is on a distinguished road
Oo oo that's not C, you trying to confuse me?
Hehe just kidding. What does perturb mean? The dictionary gives:
Quote:
Originally Posted by dictionary.com
1. To disturb greatly; make uneasy or anxious.
2. To throw into great confusion.
3. Physics & Astronomy. To cause perturbation, as of a celestial orbit.
So as to randomize the data set?
__________________
"Employ your time in improving yourself by other men's writings, so that you shall gain easily what others have labored hard for."
-- Socrates
nnxion is offline   Reply With Quote
Old Jun 13th, 2005, 8:15 AM   #8
mitakeet
Programmer
 
mitakeet's Avatar
 
Join Date: Jun 2005
Location: Maryland, USA
Posts: 59
Rep Power: 4 mitakeet is on a distinguished road
Perturb generally means to give slight randomization. For instance, if a valid value is in the range of 10-12 and you want to test values slightly outside of the valid range, you might 'perturb' the values to the range of 9-13.
__________________

Free code: http://sol-biotech.com/code/.

It is not that old programmers are any smarter or code better, it is just that they have made the same stupid mistake so many times that it is second nature to fix it.
--Mitakeet

The reasonable man adapts himself to the world; the unreasonable one persists in trying to adapt the world to himself. Therefore, all progress depends on the unreasonable man.
--George Bernard Shaw
mitakeet is offline   Reply With Quote
Old Jun 13th, 2005, 9:59 AM   #9
DaWei
Resident Grouch
 
DaWei's Avatar
 
Join Date: Jun 2005
Posts: 6,453
Rep Power: 10 DaWei is on a distinguished road
Oh, lawdy. I forgot the question arose in the C forum, not the C++. Oh well, the subject is really related to a concept/algorithm, not a language. Just wash, bleach, rinse, and iron lightly. As for "perturb," what Mitakeet said.
__________________
Abstraction doesn't make it impossible to write bad code; it makes it possible to write superior code.
Contributor's Corner: Grumpy on C++ Exceptions DaWei on Pointers
DaWei is offline   Reply With Quote
Old Jun 14th, 2005, 12:19 PM   #10
Ancient Dragon
PFO God In Training
 
Ancient Dragon's Avatar
 
Join Date: Jun 2005
Location: near St Louis, MO. (USA)
Posts: 499
Rep Power: 4 Ancient Dragon is on a distinguished road
The interpolation algorithms seem to work quite well when with linear data sets, but throw a non-linear data set at it an it is considerably slower than binary search (see changes below). this produces a set of only 30 numbers, and as you can see from the results below it took 4 times the number of attempts to find (or not find) the number.

unsigned setSize = 70;
unsigned maxValue = 1000000000;
...
...
int main (int argc, char *argv [])
{
    dataSet theSet;
    unsigned tries;
    int theVal, index, tolerance;
    unsigned maxRand = (unsigned) sqrt ((double) maxValue);
    srand ((unsigned) time (0));
    while (true)
    {
        // Generate dataset
        randomFill (theSet, maxValue, setSize);
	setSize = theSet.size();
        showData (theSet, setSize);

...
...
void randomFill (dataSet& set, unsigned max, unsigned size)
{
	unsigned val = maxValue;
	set.erase(set.begin(),set.end());
	for(unsigned i = 0; i < size && val > 0; i++)
	{
		set.push_back (val);
		val /= 2;
	}

//    for (unsigned i = 0; i < size; ++i) set.push_back ((rand () * rand ()) % max);
    sort (set.begin (), set.end ());
}

Enter a positive integer in the range 0-999999999: 1562500
Enter a tolerance value in the range 0-33333333: 3000
Interpolative search: Determined not to be in set after 21 tries
Binary search: Determined not to be in set after 5 tries
Press ENTER for next iteration

Enter a positive integer in the range 0-999999999: 15625000
Enter a tolerance value in the range 0-33333333: 3000
Interpolative search: Found in element 23 with 24 tries
Binary search: Found in element 23 with 5 tries
Press ENTER for next iteration

Last edited by Ancient Dragon; Jun 14th, 2005 at 12:31 PM.
Ancient Dragon is offline   Reply With Quote
Reply

Bookmarks

« Previous Thread in Forum | Next Thread in Forum »

Currently Active Users Viewing This Thread: 1 (0 members and 1 guests)
 
Thread Tools
Display Modes

Posting Rules
You may not post new threads
You may not post replies
You may not post attachments
You may not edit your posts

BB code is On
Smilies are On
[IMG] code is On
HTML code is Off
Forum Jump




DaniWeb IT Discussion Community
All times are GMT -5. The time now is 5:47 PM.

Powered by vBulletin® Version 3.7.0, Copyright ©2000 - 2008, Jelsoft Enterprises Ltd.
Copyright ©2007 DaniWeb® LLC