为什么在回调函数指针时使用 EXC_BAD_ACCESS (code=EXC_I386_GPFLT)?

问题描述

以下代码尝试使用 hashsetvector 计算文档中的词频。

#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>


/** ==================================== VECTOR ======================================= */

typedef enum {
    true,false
} bool;

typedef int (*VectorCompareFunction)(const void *elemAddr1,const void *elemAddr2);
typedef void (*VectorFreeFunction)(void *elemAddr);

typedef struct {
    int elemSize;               //how many byte for each element
    int elemNum;                //number of current element in vector
    int capacity;               //maximum number of element vector can hold
    void *elems;                //pointer to data memory
    VectorFreeFunction freefn;  //pointer to the function used to free each element
} vector;

/**
 * Reallocate a new memory of twice of original size
 * return 1 if reallocation success,otherwise return -1.
 */
static void DoubleMemory(vector *v) {
    void *tmp = realloc(v->elems,v->capacity * v->elemSize * 2);
    assert(tmp != NULL);
    v->elems = tmp;
    v->capacity *= 2;
}

/**
 * Constructor
 */
void VectorNew(vector *v,int elemSize,VectorFreeFunction freefn,int initialAllocation) {
    v->elems = malloc(initialAllocation * elemSize);
    assert(v->elems != NULL);
    v->elemSize = elemSize;
    v->elemNum = 0;
    v->capacity = initialAllocation;
    v->freefn = freefn;
}

/**
 * Appends a new element to the end of the specified vector.
 */
void VectorAppend(vector *v,const void *elemAddr) {
    /* double size if neccessary */
    if (v->elemNum == v->capacity) DoubleMemory(v);
    memcpy((char *)v->elems + v->elemNum * v->elemSize,elemAddr,v->elemSize);
    v->elemNum++;
}

/**
 * Search the specified vector for an element whose contents match the element passed as the key.
 */
int VectorSearch(const vector *v,const void *key,VectorCompareFunction searchfn,int startIndex,bool isSorted) {
    assert(key && searchfn);
    if (v->elemNum == 0) return -1;
    assert(startIndex >= 0 && startIndex < v->elemNum);
    if (isSorted == true) {
        /* binary search */
        void *startAddr = (char *)v->elems + startIndex * v->elemSize;
        int size = v->elemNum - startIndex;
        void *resAddr = bsearch(key,startAddr,size,v->elemSize,searchfn);
        return (resAddr != NULL)? ((char *)resAddr - (char *)v->elems) / v->elemSize : -1;
    } else {
        /* linear search */
        for (int i = 0; i < v->elemNum; i++) {
            if (searchfn((char *)v->elems + i * v->elemSize,key) == 0) {
                return i;
            }
        }
        return -1;
    }
}

/**
 * Overwrites the element at the specified position.
 */
void VectorReplace(vector *v,const void *elemAddr,int position) {
    assert(position >= 0 && position < v->elemNum);
    void *posAddr = (char *)v->elems + position * v->elemSize;
    /* free the memory of old element first */
    if (v->freefn != NULL) v->freefn(posAddr);
    memcpy(posAddr,v->elemSize);
}





/** ==================================== HASHSET ======================================= */

typedef int (*HashSetHashFunction)(const void *elemAddr,int numBuckets);
typedef int (*HashSetCompareFunction)(const void *elemAddr1,const void *elemAddr2);
typedef void (*HashSetFreeFunction)(void *elemAddr);

typedef struct {
    int elemNum;            //current element number
    int bucketNum;          //number of hash bucket
    int elemSize;           //how many byte each element has
    vector *buckets;        //array of vector
    HashSetHashFunction hashfn;
    HashSetCompareFunction compfn;
    HashSetFreeFunction freefn;
} hashset;

void HashSetNew(hashset *h,int numBuckets,HashSetHashFunction hashfn,HashSetCompareFunction comparefn,HashSetFreeFunction freefn) {
    assert(elemSize > 0 && numBuckets > 0 && hashfn != NULL && comparefn != NULL);
    h->buckets = (vector *)malloc(numBuckets * sizeof(vector));
    assert(h->buckets != NULL);
    for (int i = 0; i < numBuckets; i++) {
        vector *bucket = (vector *)((char *)h->buckets + i * sizeof(vector));
        VectorNew(bucket,elemSize,freefn,4);
    }
    h->bucketNum = numBuckets;
    h->elemSize = elemSize;
    h->elemNum = 0;
    h->hashfn = hashfn;
    h->compfn = comparefn;
    h->freefn = freefn;
}

void HashSetEnter(hashset *h,const void *elemAddr) {
    int hash = h->hashfn(elemAddr,h->bucketNum);
    vector *bucket = (vector *)((char *)h->buckets + hash * sizeof(vector));
    // search in the hash set first
    int pos = VectorSearch(bucket,h->compfn,false);
    if (pos != -1) {
        // replace the old one if find a match
        VectorReplace(bucket,pos);
    } else {
        // otherwise insert the new one
        VectorAppend(bucket,elemAddr);
        h->elemNum++;
    }
}




/** ==================================== DOC_FREQ & WORD_INDEX ======================================= */

/****************************************************************
 *
 * doc_freq is a key-value pair of [documentid,frequency]
 * It's not supposed to be exposed to user or search engine.
 * -----------------------------------------------------------
 * It looks like:
 *      [1611742826915764000]   [4      ]  
 *      |-------------------|   |-------|
 *       docid                   freq
 ***************************************************************/
typedef struct {
    long docid;
    int freq;
} doc_freq;

static void new_docfreq(doc_freq *df,long docid,int freq) {
    df->docid = docid;
    df->freq = freq;
}

/**
 * HashSetHashFunction<doc_freq>
 */
static int hash_docfreq(const void *elemAddr,int numBuckets) {
    doc_freq *df = (doc_freq *)elemAddr;
    return (int)(df->docid % numBuckets);
}

/**
 * HashSetCompareFunction<doc_freq>
 */
static int comp_docfreq(const void *elemAddr1,const void *elemAddr2) {
    long id1 = ((doc_freq *)elemAddr1)->docid;
    long id2 = ((doc_freq *)elemAddr2)->docid;
    if (id1 < id2) {
        return -1;
    } else if (id1 > id2) {
        return 1;
    } else { // id1 == id2
        return 0;
    }
}

/**
 * word_index is a index of a single word.
 * ---------------------------------------
 * A typical word_index looks like:
 *      [apple]: [doc1,5],[doc3,10],[doc5,7]
 *      |-----|  |------------------------------|
 *       word     freqs
 */
typedef struct {
    char *word;
    hashset *freqs; // hashset<doc_freq>
} word_index;

static const size_t kWordindexHashSetBuckets = 64;
static void new_wordindex(word_index *wi,const char *word) {
    hashset h;
    HashSetNew(&h,sizeof(doc_freq),kWordindexHashSetBuckets,hash_docfreq,comp_docfreq,NULL);
    wi->freqs = &h;
    size_t wordlen = strlen(word);
    wi->word = (char *)malloc(wordlen + 1); // +1 for null-termination
    strcpy(wi->word,word);
    (wi->word)[wordlen] = '\0';
}

/**
 * Mainly used to build a word_index.
 */
void add_docfreq(word_index *wi,const long docid,const int frequency) {
    doc_freq df;
    new_docfreq(&df,docid,frequency);
    HashSetEnter(wi->freqs,&df);
}





/** ==================================== UNIT-TEST ======================================= */

int main(void) {
    /* apple:   [1611742826915764000,5][1611742826915538000,10] */
    word_index *apple = (word_index *)malloc(sizeof(word_index));
    new_wordindex(apple,"apple");
    add_docfreq(apple,1611742826915764000L,5);
    add_docfreq(apple,1611742826915538000L,10);
}

它给了我一个segmentation fault

[1]    84309 segmentation fault  testindexer

lldb 发现问题发生在 hashset 尝试回调函数 hashfn 的给定指针时。我不太明白这里的 EXC_BAD_ACCESS (code=EXC_I386_GPFLT) 是什么。我之前对 hashset 做过几次单元测试,HashSetEnter() 函数hashfn 配合得很好。另一个单元测试是对 hash_docfreq() 函数进行的,它也可以正确计算哈希数。我有点困惑。任何人都可以帮忙吗?谢谢!

Process 89962 stopped
* thread #1,queue = 'com.apple.main-thread',stop reason = EXC_BAD_ACCESS (code=EXC_I386_GPFLT)
    frame #0: 0x0000000100003b83 testnothing`HashSetEnter(h=0x00007ffeefbff620,elemAddr=0x00007ffeefbff638) at test_nothing.c:130:13
   127  }
   128
   129  void HashSetEnter(hashset *h,const void *elemAddr) {
-> 130      int hash = h->hashfn(elemAddr,h->bucketNum);
   131      vector *bucket = (vector *)((char *)h->buckets + hash * sizeof(vector));
   132      // search in the hash set first
   133      int pos = VectorSearch(bucket,false);
Target 0: (testnothing) stopped.
(lldb) bt
* thread #1,stop reason = EXC_BAD_ACCESS (code=EXC_I386_GPFLT)
  * frame #0: 0x0000000100003b83 testnothing`HashSetEnter(h=0x00007ffeefbff620,elemAddr=0x00007ffeefbff638) at test_nothing.c:130:13
    frame #1: 0x0000000100003c37 testnothing`add_docfreq(wi=0x0000000100306060,docid=1611742826915764000,frequency=5) at test_nothing.c:222:2
    frame #2: 0x0000000100003cae testnothing`main at test_nothing.c:235:2
    frame #3: 0x00007fff70df0cc9 libdyld.dylib`start + 1
(lldb)

解决方法

gdb 下运行,故障后,执行 tb 命令以获取堆栈回溯,我们看到:

#0  0x00000005004016e6 in ?? ()
#1  0x000000000040163a in HashSetEnter (h=0x7fffffffdc10,elemAddr=0x7fffffffdc40) at orig.c:150
#2  0x0000000000401834 in add_docfreq (wi=0x405260,docid=1611742826915764000,frequency=5) at orig.c:266
#3  0x0000000000401879 in main () at orig.c:278
(gdb) frame 1
#1  0x000000000040163a in HashSetEnter (h=0x7fffffffdc10,elemAddr=0x7fffffffdc40) at orig.c:150
150     int hash = h->hashfn(elemAddr,h->bucketNum);

您在 HashSetEnter 行中出现段错误:

int hash = h->hashfn(elemAddr,h->bucketNum);

这是因为 h 在这一点上无效


查源,设置最终无效值的地方,设置在new_wordindex中。

new_wordindex 中,您正在保存 [并返回] h 的地址。

h 这里是函数作用域变量,所以函数返回后不再有效。

为此您必须使用 malloc。并且,稍后,您需要能够在清理期间 free 这个指针。


这是错误函数的重构代码。

请注意,为了显示旧/原始代码与新/更正代码,我使用了预处理器条件:

#if 0
// old/original code
// NOTE: this is _not_ compiled in
#else
// new/corrected code
// NOTE: this _is_ compiled in
#endif

#if 0 下的代码可以省略/删除,只留下 #else 代码。

static void
new_wordindex(word_index * wi,const char *word)
{
// NOTE/BUG: h is function scoped -- this can _not_ be saved and returned
// because it ceases to be valid when we return
#if 0
    hashset h;
    HashSetNew(&h,sizeof(doc_freq),kWordIndexHashSetBuckets,hash_docfreq,comp_docfreq,NULL);
    wi->freqs = &h;
#else
    hashset *h = malloc(sizeof(*h));
    HashSetNew(h,NULL);
    wi->freqs = h;
#endif

    size_t wordlen = strlen(word);

    wi->word = (char *) malloc(wordlen + 1);    // +1 for null-termination
    strcpy(wi->word,word);
    (wi->word)[wordlen] = '\0';
}