
哈希表是一种基于数组的数据结构,它通过哈希函数将键映射到数组索引,实现快速的数据访问。Python的字典和集合都是基于哈希表实现的。
# Simple hash function example.
def simple_hash(key, table_size):
    """Map *key* to a bucket index in ``range(table_size)``.

    Strings hash to the sum of their characters' code points; integers
    are reduced directly with the modulo operator.  Any other hashable
    key falls back to the built-in ``hash()`` (a backward-compatible
    generalization — the original raised TypeError for e.g. tuples).
    """
    if isinstance(key, str):
        # Sum of the characters' code points.
        return sum(ord(c) for c in key) % table_size
    if isinstance(key, int):
        # Integers: plain modular reduction.
        return key % table_size
    # Generalization: support any hashable key via built-in hash().
    return hash(key) % table_size


# Demonstrate the hash function.
print(simple_hash("hello", 10))  # maps the string into 0-9
print(simple_hash(42, 10))       # maps the number into 0-9
# Python resolves hash collisions with open addressing:
class SimpleHashTable:
    """Fixed-capacity hash table using open addressing (linear probing).

    Teaching example: no automatic resizing and no deletion.  Slots hold
    ``(key, value)`` tuples or ``None``.
    """

    def __init__(self, size):
        self.size = size
        self.table = [None] * size
        self.count = 0  # number of occupied slots

    def _hash(self, key):
        """Map *key* to its starting slot index."""
        return simple_hash(key, self.size)

    def _find_slot(self, key):
        """Return the index of *key*'s slot or of the first free slot
        on its probe sequence; return ``None`` when the table is full
        and *key* is absent (fix: previously raised a generic
        Exception, which leaked out of ``get`` instead of KeyError)."""
        index = self._hash(key)
        start = index
        while True:
            entry = self.table[index]
            if entry is None or entry[0] == key:
                return index
            # Linear probing: step to the next slot, wrapping around.
            index = (index + 1) % self.size
            if index == start:
                return None

    def put(self, key, value):
        """Insert or update *key*; raise if the table is full."""
        slot = self._find_slot(key)
        if slot is None:
            raise Exception("Hash table is full")
        if self.table[slot] is None:
            self.count += 1
        self.table[slot] = (key, value)

    def get(self, key):
        """Return the value stored for *key*; raise KeyError if absent
        (now also when the table is full, for a consistent contract)."""
        slot = self._find_slot(key)
        if slot is None or self.table[slot] is None:
            raise KeyError(key)
        return self.table[slot][1]
# Usage example.
demo_table = SimpleHashTable(10)
demo_table.put("hello", 42)
demo_table.put("world", 24)
print(demo_table.get("hello"))  # output: 42
# Python dicts store hashes, keys and values in separate arrays:
import sys

# Build a dict and inspect its memory footprint.
d = {'a': 1, 'b': 2, 'c': 3}
print(f"字典占用内存: {sys.getsizeof(d)} bytes")

# A list with the same elements, for comparison.
l = ['a', 'b', 'c']
print(f"列表占用内存: {sys.getsizeof(l)} bytes")


def monitor_dict_size():
    """Print how a dict's allocated size changes as items are added."""
    d = {}
    initial_size = sys.getsizeof(d)
    print(f"空字典大小: {initial_size} bytes")
    # Add elements one by one and watch the size grow (resizes happen
    # in jumps, not per insertion).
    for i in range(10):
        d[i] = i
        current_size = sys.getsizeof(d)
        print(f"添加 {i} 后的大小: {current_size} bytes")


monitor_dict_size()
import timeit
def measure_dict_performance():
    """Benchmark dict lookup and insertion with ``timeit``."""
    # Test data: 10k integer keys.
    sample = {i: i for i in range(10000)}
    # Repeated lookup of a middle key.
    lookup_time = timeit.timeit(lambda: sample[5000], number=100000)
    # Repeated insertion of a fresh key each call.
    insert_time = timeit.timeit(
        lambda: sample.update({len(sample): len(sample)}), number=100000)
    print(f"查找时间: {lookup_time:.6f} 秒")
    print(f"插入时间: {insert_time:.6f} 秒")
measure_dict_performance()


def analyze_set_memory():
    """Report the memory footprint of sets of several sizes."""
    s1 = set(range(10))
    s2 = set(range(100))
    s3 = set(range(1000))
    print(f"10个元素的集合: {sys.getsizeof(s1)} bytes")
    print(f"100个元素的集合: {sys.getsizeof(s2)} bytes")
    print(f"1000个元素的集合: {sys.getsizeof(s3)} bytes")
analyze_set_memory()


def demonstrate_set_operations():
    """Time the four basic set operations on two small sets."""
    s1 = {1, 2, 3, 4, 5}
    s2 = {4, 5, 6, 7, 8}

    def time_operation(op, stmt):
        # BUG FIX: the original passed globals=locals(); locals() of this
        # helper contains only op/stmt — s1 and s2 are not free variables
        # here, so every timed statement raised NameError.  Pass the sets
        # explicitly instead.
        time = timeit.timeit(stmt, globals={"s1": s1, "s2": s2},
                             number=100000)
        print(f"{op}: {time:.6f} 秒")

    time_operation("交集", "s1 & s2")
    time_operation("并集", "s1 | s2")
    time_operation("差集", "s1 - s2")
    time_operation("对称差集", "s1 ^ s2")
demonstrate_set_operations()


def compare_dict_creation():
    """Compare building a dict with a loop vs. a comprehension."""
    def create_normal():
        # Fill an empty dict one key at a time.
        result = {}
        for i in range(10000):
            result[i] = i

    def create_with_dict_comp():
        # Build the whole dict with a single comprehension.
        return {i: i for i in range(10000)}

    normal_time = timeit.timeit(create_normal, number=100)
    comp_time = timeit.timeit(create_with_dict_comp, number=100)
    print(f"普通创建时间: {normal_time:.6f} 秒")
    print(f"预分配创建时间: {comp_time:.6f} 秒")
compare_dict_creation()


def demonstrate_resize_impact():
    """Compare growing a set element-by-element vs. building it at once."""
    def add_one_by_one():
        acc = set()
        for value in range(1000):
            acc.add(value)

    def add_all_at_once():
        return set(range(1000))

    time1 = timeit.timeit(add_one_by_one, number=1000)
    time2 = timeit.timeit(add_all_at_once, number=1000)
    print(f"逐个添加时间: {time1:.6f} 秒")
    print(f"一次性添加时间: {time2:.6f} 秒")
demonstrate_resize_impact()


def analyze_hash_distribution():
    """Print the fraction of distinct hash values for several key types."""
    def check_hash_distribution(items):
        # Ratio of unique hashes to total items (1.0 = collision-free).
        hashes = [hash(item) for item in items]
        return len(set(hashes)) / len(hashes)

    # Sample data of three different hashable types.
    strings = [f"str{i}" for i in range(1000)]
    numbers = list(range(1000))
    tuples = [(i, i + 1) for i in range(1000)]
    print(f"字符串哈希分布: {check_hash_distribution(strings):.2%}")
    print(f"数字哈希分布: {check_hash_distribution(numbers):.2%}")
    print(f"元组哈希分布: {check_hash_distribution(tuples):.2%}")
analyze_hash_distribution()


def demonstrate_memory_reuse():
    """Report dict sizes before deletion and after re-creation."""
    first = dict.fromkeys(range(1000))
    initial_size = sys.getsizeof(first)
    # Drop the first dict, then build an identical one.
    del first
    second = dict.fromkeys(range(1000))
    new_size = sys.getsizeof(second)
    print(f"初始字典大小: {initial_size} bytes")
    print(f"新字典大小: {new_size} bytes")
demonstrate_memory_reuse()


def compare_data_structures():
    """Compare membership-test speed of a list, a set and a dict.

    Fix: the set/dict variants previously rebuilt the container inside
    the timed function (``5000 in set(data)``), so they measured
    construction rather than lookup and made the O(1) structures look
    slow.  The containers are now built once, outside the timed code.
    """
    data = list(range(10000))
    data_set = set(data)                 # build once
    data_dict = dict.fromkeys(data)      # build once

    def test_list_lookup():
        return 5000 in data              # O(n) linear scan

    def test_set_lookup():
        return 5000 in data_set          # O(1) hash lookup

    def test_dict_lookup():
        return 5000 in data_dict         # O(1) hash lookup

    list_time = timeit.timeit(test_list_lookup, number=1000)
    set_time = timeit.timeit(test_set_lookup, number=1000)
    dict_time = timeit.timeit(test_dict_lookup, number=1000)
    print(f"列表查找时间: {list_time:.6f} 秒")
    print(f"集合查找时间: {set_time:.6f} 秒")
    print(f"字典查找时间: {dict_time:.6f} 秒")
compare_data_structures()


def compare_memory_usage():
    """Report memory used by a list, set and dict over the same keys."""
    # Same 10k keys in three different data structures.
    data = range(10000)
    l = list(data)
    s = set(data)
    d = dict.fromkeys(data)
    print(f"列表内存使用: {sys.getsizeof(l)} bytes")
    print(f"集合内存使用: {sys.getsizeof(s)} bytes")
    print(f"字典内存使用: {sys.getsizeof(d)} bytes")
compare_memory_usage()

# When the approximate element count is known, build with a
# dict/set comprehension in one go.
known_size_dict = {i: i for i in range(1000)}
known_size_set = {i for i in range(1000)}
# Use a set rather than a list when lookups are frequent:
def find_duplicates(items):
    """Return the set of values that occur more than once in *items*."""
    seen = set()
    repeated = set()
    for value in items:
        # A value already seen is a duplicate; record every value seen.
        if value in seen:
            repeated.add(value)
        seen.add(value)
    return repeated
# Bad practice:
def bad_practice(items):
    # The inner list() is redundant — set() accepts any iterable.
    return list(set(list(items)))


# Good practice:
def good_practice(items):
    # Deduplicate via a set, then materialize as a list.
    unique = set(items)
    return list(unique)
# Use a dict comprehension instead of a loop:
# Dict comprehension instead of an explicit loop.
squares = {x: x**2 for x in range(100)}
# Set comprehension used for filtering.
even_numbers = {x for x in range(100) if x % 2 == 0}
# By understanding how dicts and sets are implemented under the hood, we
# can use these data structures more effectively and write more efficient
# Python.  In practice, pick data structures and optimization strategies
# that fit the concrete scenario.