一个Python和C性能对比的好例子


VCKbase的论坛上有人提了一个问题:
遇到的问题是,我把E-mail地址收集在txt文件中(每行一个E-mail地址,共78000行)
,要求是把其中重复的E-mail地址全部去掉。考虑编程方便,我用了CString类,但似乎
效率不高。我写出来的整理78000份E-mail地址的需要近7分钟的时间
我给他用Python实现了一个(这个版本是修改过的,当时我随手实现的一个版本做错了,呵呵)
这里算法的主要内容就是利用hash_table来判断email是否已经重复出现,这比字符串的查找要快的多。
#remove duplicated email address from file
import datetime
if __name__ == "__main__":
        t = datetime.datetime(2000,1,1)
        print str(t.today())
        hashtable = {}
        f = file("email.txt","r")
        f2 = file("email_new.txt","w")
        line = f.readline();
        while len(line)>0:
                if not hashtable.has_key(line):
                        hashtable[line] = 1
                        f2.write(line)
                line = f.readline();
        f.close()
        f2.close()
        t2 = datetime.datetime(2000,1,1)
        print str(t2.today())
               
上面的算法在我的机器上:P4 2.8G,512M 内存,耗时大约300ms,跟他的7分钟差了天了。
而我当时错误的算法,需要2分半钟,所以我贴到我们内部的BBS上,我的一个同学实现了如下算法(C):
#include
#include
#include
#include "list.h"
#define HASH_SIZE 262157
#define LINE_MAX_LEN 1024
struct list_node {
        struct list_head head;
        int len;
        char *str;
};
struct list_head hash_table[HASH_SIZE];
static inline int
hash_string (const char *ptr, int len)
{
        unsigned int hash = 0;
       
        for (hash = 0; len; len--, ptr++) {
                /* (31 * hash) will probably be optimized to ((hash len == len) && (memcmp(node->str, str, len) == 0));
}
int main(int argc, char **argv)
{
        int i, hash, len;
        struct list_node *node;
        char buf[LINE_MAX_LEN];
        FILE *fp_in, *fp_out;
       
        if(argc != 3) {
                printf("error: \nplease use eml as:\n eml infile outfile\n");
                return -1;
        }
        fp_in = fopen(argv[1], "r");
        if(!fp_in) {
                printf("error: could not open infile %s\n", argv[1]);
                return -1;
        }
        fp_out = fopen(argv[2], "w");
        if(!fp_out) {
                printf("error: could not open infile %s\n", argv[2]);
                return -1;
        }
        for(i=0; istr = (char *)malloc(len + 1);
                        node->len = len;
                        memcpy(node->str, buf, len);
                        node->str[len] = '\0';
                        list_append(&(hash_table[hash]), node);
                }
        }
        return 0;
}
还有.h文件:
#ifndef _LINUX_LIST_H
#define _LINUX_LIST_H
extern inline void prefetch(const void *x)
{
        return;
}
/*
* Simple doubly linked list implementation.
*
* Some of the internal functions ("__xxx") are useful when
* manipulating whole lists rather than single entries, as
* sometimes we already know the next/prev entries and we can
* generate better code by using them directly rather than
* using the generic single-entry routines.
*/
struct list_head {
        struct list_head *next, *prev;
};
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) \
        struct list_head name = LIST_HEAD_INIT(name)
#define INIT_LIST_HEAD(ptr) do { \
        (ptr)->next = (ptr); (ptr)->prev = (ptr); \
} while (0)
/*
* Insert a new entry between two known consecutive entries.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void
__list_add(struct list_head *new,
           struct list_head *prev, struct list_head *next)
{
        next->prev = new;
        new->next = next;
        new->prev = prev;
        prev->next = new;
}
/**
* list_add - add a new entry
* @new: new entry to be added
* @head: list head to add it after
*
* Insert a new entry after the specified head.
* This is good for implementing stacks.
*/
static inline void
list_add(struct list_head *new, struct list_head *head)
{
        __list_add(new, head, head->next);
}
/**
* list_add_tail - add a new entry
* @new: new entry to be added
* @head: list head to add it before
*
* Insert a new entry before the specified head.
* This is useful for implementing queues.
*/
static inline void
list_add_tail(struct list_head *new, struct list_head *head)
{
        __list_add(new, head->prev, head);
}
/*
* Delete a list entry by making the prev/next entries
* point to each other.
*
* This is only for internal list manipulation where we know
* the prev/next entries already!
*/
static inline void
__list_del(struct list_head *prev, struct list_head *next)
{
        next->prev = prev;
        prev->next = next;
}
/**
* list_del - deletes entry from list.
* @entry: the element to delete from the list.
* Note: list_empty on entry does not return true after this, the entry is in an undefined state.
*/
static inline void
list_del(struct list_head *entry)
{
        __list_del(entry->prev, entry->next);
        entry->next = (void *) 0;
        entry->prev = (void *) 0;
}
/**
* list_del_init - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*/
static inline void
list_del_init(struct list_head *entry)
{
        __list_del(entry->prev, entry->next);
        INIT_LIST_HEAD(entry);
}
/**
* list_move - delete from one list and add as another's head
* @list: the entry to move
* @head: the head that will precede our entry
*/
static inline void
list_move(struct list_head *list, struct list_head *head)
{
        __list_del(list->prev, list->next);
        list_add(list, head);
}
/**
* list_move_tail - delete from one list and add as another's tail
* @list: the entry to move
* @head: the head that will follow our entry
*/
static inline void
list_move_tail(struct list_head *list, struct list_head *head)
{
        __list_del(list->prev, list->next);
        list_add_tail(list, head);
}
/**
* list_empty - tests whether a list is empty
* @head: the list to test.
*/
static inline int
list_empty(struct list_head *head)
{
        return head->next == head;
}
static inline void
__list_splice(struct list_head *list, struct list_head *head)
{
        struct list_head *first = list->next;
        struct list_head *last = list->prev;
        struct list_head *at = head->next;
        first->prev = head;
        head->next = first;
        last->next = at;
        at->prev = last;
}
/**
* list_splice - join two lists
* @list: the new list to add.
* @head: the place to add it in the first list.
*/
static inline void
list_splice(struct list_head *list, struct list_head *head)
{
        if (!list_empty(list))
                __list_splice(list, head);
}
/**
* list_splice_init - join two lists and reinitialise the emptied list.
* @list: the new list to add.
* @head: the place to add it in the first list.
*
* The list at @list is reinitialised
*/
static inline void
list_splice_init(struct list_head *list, struct list_head *head)
{
        if (!list_empty(list)) {
                __list_splice(list, head);
                INIT_LIST_HEAD(list);
        }
}
/**
* list_entry - get the struct for this entry
* @ptr:        the &struct list_head pointer.
* @type:        the type of the struct this is embedded in.
* @member:        the name of the list_struct within the struct.
*/
#define list_entry(ptr, type, member) \
        ((type *)((char *)(ptr)-(unsigned long)(&((type *)0)->member)))
/**
* list_for_each        -        iterate over a list
* @pos:        the &struct list_head to use as a loop counter.
* @head:        the head for your list.
*/
#define list_for_each(pos, head) \
        for (pos = (head)->next, prefetch(pos->next); pos != (head); \
                pos = pos->next, prefetch(pos->next))
/**
* list_for_each_safe        -        iterate over a list safe against removal of list entry
* @pos:        the &struct list_head to use as a loop counter.
* @n:                another &struct list_head to use as temporary storage
* @head:        the head for your list.
*/
#define list_for_each_safe(pos, n, head) \
        for (pos = (head)->next, n = pos->next; pos != (head); \
                pos = n, n = pos->next)
/**
* list_for_each_entry        -        iterate over list of given type
* @pos:        the type * to use as a loop counter.
* @head:        the head for your list.
* @member:        the name of the list_struct within the struct.
*/
#define list_for_each_entry(pos, head, member)                                \
        for (pos = list_entry((head)->next, typeof(*pos), member),        \
                     prefetch(pos->member.next);                        \
             &pos->member != (head);                                         \
             pos = list_entry(pos->member.next, typeof(*pos), member),        \
                     prefetch(pos->member.next))
/*this is the begin of listhelp ;-)*/
#define LIST_FIND(head, cmpfn, type, args...)                \
({                                                        \
        const struct list_head *__i = (head);                \
                                                        \
        do {                                                \
                __i = __i->next;                        \
                if (__i == (head)) {                        \
                        __i = NULL;                        \
                        break;                                \
                }                                        \
        } while (!cmpfn((const type)__i , ## args));        \
        (type)__i;                                        \
})
#define LIST_FIND_W(head, cmpfn, type, args...)        \
({                                                \
        const struct list_head *__i = (head);        \
                                                \
        do {                                        \
                __i = __i->next;                \
                if (__i == (head)) {                \
                        __i = NULL;                \
                        break;                        \
                }                                \
        } while (!cmpfn((type)__i , ## args));        \
        (type)__i;                                \
})
static inline int
__list_cmp_same(const void *p1, const void *p2)
{
        return p1 == p2;
}
/* Is this entry in the list? */
static inline int
list_inlist(struct list_head *head, const void *entry)
{
        return LIST_FIND(head, __list_cmp_same, void *, entry) !=NULL;
}
/* Delete from list. */
#ifdef CONFIG_NETFILTER_DEBUG
#define LIST_DELETE(head, oldentry)                                        \
do {                                                                        \
        if (!list_inlist(head, oldentry))                                \
                printk("LIST_DELETE: %s:%u `%s'(%p) not in %s.\n",        \
                       __FILE__, __LINE__, #oldentry, oldentry, #head);        \
        else list_del((struct list_head *)oldentry);                        \
} while(0)
#else
#define LIST_DELETE(head, oldentry) list_del((struct list_head *)oldentry)
#endif
/* Append. */
static inline void
list_append(struct list_head *head, void *new)
{
        list_add((new), (head)->prev);
}
/* Prepend. */
static inline void
list_prepend(struct list_head *head, void *new)
{
        list_add(new, head);
}
/* Insert according to ordering function; insert before first true. */
#define LIST_INSERT(head, new, cmpfn)                                \
do {                                                                \
        struct list_head *__i;                                        \
        for (__i = (head)->next;                                \
             !cmpfn((new), (typeof (new))__i) && __i != (head);        \
             __i = __i->next);                                        \
        list_add((struct list_head *)(new), __i->prev);                \
} while(0)
/* If the field after the list_head is a nul-terminated string, you
   can use these functions. */
static inline int
__list_cmp_name(const void *i, const char *name)
{
        return strcmp(name, i + sizeof (struct list_head)) == 0;
}
static inline int
__list_cmp_nameptr(const void *i, const char *name)
{
        return strcmp(name, ((char *)
                             *(unsigned long *) (i +
                                                 sizeof (struct list_head)))) ==
            0;
}
/* Returns false if same name already in list, otherwise does insert. */
static inline int
list_named_insert(struct list_head *head, void *new)
{
        if (LIST_FIND(head, __list_cmp_name, void *,
                      new + sizeof (struct list_head)))
                 return 0;
        list_prepend(head, new);
        return 1;
}
static inline int
list_nameptr_insert(struct list_head *head, void *new)
{
        if (LIST_FIND(head, __list_cmp_nameptr, void *,
                      new + sizeof (struct list_head)))
                 return 0;
        list_prepend(head, new);
        return 1;
}
/* Find this named element in the list. */
#define list_named_find(head, name)                        \
LIST_FIND(head, __list_cmp_name, void *, name)
#define list_nameptr_find(head, name)                        \
LIST_FIND(head, __list_cmp_nameptr, void *, name)
#endif
在我的机器上测试(cygwin):耗时竟然是400ms!
当然,这里包含了Cygwin的间接调用开销。
很难想象吧,Python在这里的性能竟然跟C不相上下!而且Python的代码比C不知道短了多少倍!
所以讲,Python是蛮好用的,:-).
不过,这个例子其实是个特殊情况,以前我们玩过的另外一个程序,C的性能就比Python快得多。不过代码量还是Python简短得多。
有兴趣的朋友可以用自己熟悉的语言来测试一下,下面是测试文件(也是我用11行python代码产生的,有1/10的email是重复的):
这其实是一个rar,下载后请修改扩展名