Learn how to write beautiful, idiomatic Python code that will improve readability and performance.
These are some idiomatic Pythonic
ways to write code based on this video by Raymond Hettinger. Under each major section, you will see two sub-sections: Don't do this
and Do this
. Code under Don't do this
is discouraged and, to borrow Jeff Knupp's adjective, is harmful
. Code under Do this
is the encouraged, beautiful
and idiomatic
Pythonic way to write the code instead. However, as you will see, some code examples are included to compare speed rather than style.
Additional idiomatic Pythonic syntax has also been added in while some from the original video were left out (we will try to find alternative working examples).
# Don't do this: spell out every value of the sequence by hand.
for i in [0, 1, 2, 3, 4, 5]:
    print(i ** 2)

# Do this: range(6) produces the same six values, 0 through 5.
for i in range(6):
    print(i ** 2)
names = ['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan']

# Don't do this: generate indices only to look each element up again.
for i in range(len(names)):
    print(names[i])

# Do this: iterate over the elements directly.
for name in names:
    print(name)

# Don't do this: count indices down from the last one to walk backwards.
for i in range(len(names) - 1, -1, -1):
    print(names[i])

# Do this: reversed() yields the elements from last to first.
for name in reversed(names):
    print(name)

# Don't do this: index into the list when both position and value are needed.
for i in range(len(names)):
    print(i, names[i])

# Do this: enumerate() yields (index, element) pairs.
for i, name in enumerate(names):
    print(i, name)
names = ['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan']
colors = ['red', 'green', 'blue', 'orange', 'purple', 'pink']

# Don't do this: compute the shorter length and index both lists by hand.
n = min(len(names), len(colors))
for i in range(n):
    print(names[i], colors[i])

# Do this: zip() pairs the elements and stops at the shorter input.
for name, color in zip(names, colors):
    print(name, color)
Here, we need to flatten an array of arrays into one array. Notice that the second discouraged approach is actually the fastest, faster even than the encouraged approaches. The setup with the x
array and the for loop spans 3 lines. This example is debatable because it trades idiomatic Python for speed.
# Input for the timings below: ten lists of ten million integers each.
data = [list(range(10000000)) for _ in range(10)]
%%time
# Discouraged (1 of 2): append one value at a time inside a nested loop.
x = []
for arr in data:
    for val in arr:
        x.append(val)
len(x)
%%time
# Discouraged (2 of 2): extend the result with one whole inner list per step.
x = []
for arr in data:
    x.extend(arr)
len(x)
%%time
# Encouraged (1 of 2): a nested list comprehension.
x = [val for arr in data for val in arr]
len(x)
%%time
# Encouraged (2 of 2): chain.from_iterable() returns a lazy iterator, so it is
# materialized with list() before its length is taken.
import itertools
x = itertools.chain.from_iterable(data)
len(list(x))
names = ['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan']

# Don't do this: create each bucket by hand before appending to it.
d = {}
for name in names:
    key = len(name)
    if key not in d:
        d[key] = []
    d[key].append(name)
print(d)

# Do this: defaultdict(list) creates the empty bucket on first access.
from collections import defaultdict

d = defaultdict(list)
for name in names:
    key = len(name)
    d[key].append(name)
print(d)

# Alternative: itertools.groupby() only groups adjacent items, so the names
# are sorted by the same key first.  `len` is passed directly as the key
# function; wrapping it in a lambda adds nothing.
import itertools

d = {k: list(g) for k, g in itertools.groupby(sorted(names, key=len), key=len)}
print(d)
# Input for both timed variants: the integers 0 through 9,999,999.
data = [i for i in range(10000000)]
%%time
# Discouraged: three explicit passes, each building or consuming a full list.
# Pass 1 doubles every value.
x = []
for val in data:
    x.append(val * 2)
# Pass 2 keeps the even values.
y = []
for val in x:
    if val % 2 == 0:
        y.append(val)
# Pass 3 adds the survivors up.
z = 0
for val in y:
    z = z + val
print(z)
%%time
# Encouraged: the same double / keep-even / sum pipeline expressed with
# map(), filter() and functools.reduce().
from functools import reduce
x = map(lambda val: val * 2, data)
x = filter(lambda val: val % 2 == 0, x)
x = reduce(lambda val1, val2: val1 + val2, x)
print(x)
d = {
    'username': 'jdoe'
}

# Don't do this: start from a default and flip it in a separate branch.
is_authorized = False
if 'auth_token' in d:
    is_authorized = True
print(is_authorized)

# Do this: the membership test already is the boolean we want.
# Note that d.get('auth_token', False) is not equivalent: when the key is
# present it returns the token itself rather than True.  Reach for dict.get()
# when you need the stored value with a fallback, and for `in` when you only
# need a yes/no answer.
is_authorized = 'auth_token' in d
print(is_authorized)
d1 = {'color': 'red', 'user': 'jdoe'}
d2 = {'color': 'blue', 'first_name': 'john', 'last_name': 'doe'}

# Don't do this: copy one dictionary and overwrite it with the other.
# Keys present in both ('color') end up with the value from d2.
d = d1.copy()
d.update(d2)
for k, v in d.items():
    print(k, v)

# Do this: ChainMap layers the dictionaries without copying them.  Lookups
# search the maps in the order given, so d2 takes precedence over d1.
from collections import ChainMap

d1 = {'color': 'red', 'user': 'jdoe'}
d2 = {'color': 'blue', 'first_name': 'john', 'last_name': 'doe'}
d = ChainMap(d2, d1)
for k, v in d.items():
    print(k, v)
names = ['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan']

# Don't do this: initialise every counter by hand before incrementing it.
d = {}
for name in names:
    key = len(name)
    if key not in d:
        d[key] = 0
    d[key] = d[key] + 1
print(d)

# Better: defaultdict(int) starts every missing key at 0.
names = ['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan']
d = defaultdict(int)
for name in names:
    key = len(name)
    d[key] = d[key] + 1
print(d)

# Better: a Counter also returns 0 for missing keys.
from collections import Counter

d = Counter()
for name in names:
    key = len(name)
    d[key] = d[key] + 1
print(d)

# Best: let Counter consume the lengths directly.  `len` is passed to map()
# as-is; a lambda that only forwards to len() is unnecessary.
d = Counter(map(len, names))
print(d)
# Don't do this: bind the unwanted third element to a throwaway name like tmp.
def get_info():
    """Return the three-element tuple ('John', 'Doe', 28)."""
    return 'John', 'Doe', 28

fname, lname, tmp = get_info()
print(fname, lname)


# Do this: use the conventional _ placeholder for the value that is ignored.
def get_info():
    """Return the three-element tuple ('John', 'Doe', 28)."""
    return 'John', 'Doe', 28

fname, lname, _ = get_info()
print(fname, lname)
# NOTE: `names` is not defined in this snippet; it is expected to exist from
# earlier code.
scores = [80, 90, 95, 88, 99, 93]

# Don't do this: plain tuples force the reader to remember what positions
# 0 and 1 stand for.
students = [(name, score) for name, score in zip(names, scores)]
for student in students:
    print('{} {}'.format(student[0], student[1]))

# Do this: a namedtuple gives every field a name while remaining a tuple.
from collections import namedtuple

Student = namedtuple('Student', 'name score')
students = [Student(name, score) for name, score in zip(names, scores)]
for student in students:
    print('{} {}'.format(student.name, student.score))
The key is to avoid long code that breaks up the coherent intention. In the discouraged approach, we receive a tuple, and store it in s
and then for each element in s
, use a different line to access the values. In the encouraged approach, the tuple is unpacked neatly into one line.
def get_student():
    """Return the three-element tuple ('john', 'doe', 88)."""
    return 'john', 'doe', 88


# Don't do this: keep the tuple in s and pick it apart one index per line.
s = get_student()
first_name = s[0]
last_name = s[1]
score = s[2]
print(first_name, last_name, score)

# Do this: unpack the tuple into its three names in a single statement.
first_name, last_name, score = get_student()
print(first_name, last_name, score)
# NOTE: `names` is not defined in this snippet; it is expected to exist from
# earlier code.

# Don't do this: grow the string piece by piece and special-case the last
# element so that no trailing separator is added.
s = ''
for i, name in enumerate(names):
    s += name
    if i < len(names) - 1:
        s += ', '
s

# Do this: str.join() places the separator between the elements only.
', '.join(names)
names = ['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan']

# Don't do this: every change at the front of a list shifts all remaining
# elements, so each of these operations costs O(n).
del names[0]
print(names)
names.pop(0)
print(names)
names.insert(0, 'jerry')
print(names)

# Do this: a deque removes and inserts at either end in O(1).  The data has
# to actually be a deque for that to apply, and the front is then addressed
# with popleft() / appendleft() instead of pop(0) / insert(0, ...).
from collections import deque

names = deque(['john', 'jane', 'jeremy', 'janice', 'joyce', 'jonathan'])
names.remove('john')
print(names)
names.popleft()
print(names)
names.appendleft('jerry')
print(names)
The key here is to use the lru_cache
decorator to cache the results of pure functions (functions that always return the same result for the same arguments), especially if they are expensive to call. Note how each call to add
takes about 700 milliseconds. However, with the lru_cache
decorator, subsequent calls take on the order of microseconds.
def add(n):
    """Return the sum of the integers 0 through n - 1."""
    return sum([i for i in range(n)])
%%time
# Uncached: this call performs the full summation.
add(10000000)
%%time
# Uncached: the same work is repeated for the same argument.
add(10000000)
from functools import lru_cache
@lru_cache(maxsize=32)
def add(n):
    """Return the sum of the integers 0 through n - 1, memoized by lru_cache."""
    return sum([i for i in range(n)])
%%time
# First call with this argument: the result is computed and then cached.
add(10000000)
%%time
# Same argument again: the result is returned from the cache.
add(10000000)
# Don't do this: manage the file handle by hand with try/finally.
f = open('README.md')
try:
    data = f.read()
    print(len(data))
finally:
    f.close()

# Do this: the with statement closes the file when the block is left,
# including when an exception is raised inside it.
with open('README.md') as f:
    data = f.read()
    print(len(data))
import os

# Don't do this: a try/except whose only purpose is to ignore the error.
try:
    os.remove('test.tmp')
except OSError:
    pass

# Do this: contextlib.suppress() states the same intent in one line.
from contextlib import suppress

with suppress(OSError):
    os.remove('test.tmp')
The key here is to avoid looping over elements and storing results. Instead, use a list comprehension or a generator expression. Note that the list comprehension
(note the brackets) eagerly evaluates the expressions and returns a list, but the generator expression
(note the parentheses) lazily evaluates the expressions.
# Don't do this: collect intermediate results in a list by hand.
results = []
for i in range(10):
    s = i ** 2
    results.append(s)
total = sum(results)
print(total)

# Do this (eager): the list comprehension builds the whole list first.
total = sum([i ** 2 for i in range(10)])
print([i ** 2 for i in range(10)])
print(total)

# Do this (lazy): the generator expression produces values on demand.
# Printing it shows the generator object itself, not the values.
total = sum((i ** 2 for i in range(10)))
print((i ** 2 for i in range(10)))
print(total)
# Don't do this: loop, test and append by hand.
nums = []
for i in range(100):
    if i % 2 == 0:
        nums.append(i)
print(nums)

# Do this: a list comprehension with a filter condition.
nums = [i for i in range(100) if i % 2 == 0]
print(nums)
def format_information(first_name, last_name, age):
    """Build the sentence '<first_name> <last_name> is <age> years old'."""
    template = '{} {} is {} years old'
    return template.format(first_name, last_name, age)


# Positional call: the reader has to know the parameter order.
format_information('John', 'Doe', 28)

# Keyword call: every argument is labelled at the call site.
format_information(first_name='John', last_name='Doe', age=28)

# The keyword arguments can also be supplied by unpacking a dictionary.
person = {
    'first_name': 'John',
    'last_name': 'Doe',
    'age': 28
}
format_information(**person)
def update_x(x):
    """Return x + 1."""
    return x + 1


def update_y(y):
    """Return y + 1."""
    return y + 1


# Don't do this: stage every new value in a temporary so that no assignment
# sees a value that was already updated.
x = 3
y = 4
dx = 4
dy = 5
tmp_x = x + dx
tmp_y = y + dy
tmp_dx = update_x(x)
tmp_dy = update_y(y)
x = tmp_x
y = tmp_y
dx = tmp_dx
dy = tmp_dy
print(x, y, dx, dy)

# Do this: tuple assignment evaluates the whole right-hand side from the old
# values before any name on the left is rebound.
x = 3
y = 4
dx = 4
dy = 5
x, y, dx, dy = (x + dx, y + dy, update_x(x), update_y(y))
print(x, y, dx, dy)
# First variant: a named function written with def.
def add_one(x):
    """Return x + 1."""
    return x + 1

add_one(3)

# Second variant: the same function written as a lambda bound to a name.
# NOTE(review): PEP 8 recommends a def statement over assigning a lambda
# expression to a name - confirm this is the form the tutorial means to
# promote.
add_one = lambda x: x + 1
add_one(3)
%%time
# Don't do this: build the complete list in memory before it is summed.
def generate_sequential_numbers(n):
    """Return a list of the integers 0 through n - 1."""
    nums = []
    for i in range(n):
        nums.append(i)
    return nums
sum(generate_sequential_numbers(10000000))
%%time
# Do this: a generator function hands the values to sum() one at a time.
def generate_sequential_numbers(n):
    """Yield the integers 0 through n - 1 one at a time."""
    for i in range(n):
        yield i
sum(generate_sequential_numbers(10000000))
%%time
# Variant: a lambda that returns a generator expression over the same range.
generate_sequential_numbers = lambda n: (i for i in range(n))
sum(generate_sequential_numbers(10000000))
Here, we want to create two dictionaries; index-to-word i2w
and word-to-index w2i
. In the discouraged approach, we create two empty dictionaries, use a for loop, and set the key-value pairs with the help of enumerate
; there are 5 lines of code. In the encouraged approach, using two lines of code, we can declare and populate the dictionaries with dict comprehensions.
words = ['i', 'like', 'to', 'eat', 'pizza', 'and', 'play', 'tennis']

# Don't do this: create two empty dictionaries and fill them in a loop.
i2w = {}
w2i = {}
for i, word in enumerate(words):
    i2w[i] = word
    w2i[word] = i
print(i2w)
print(w2i)

# Do this: build each mapping with one dict comprehension.
i2w = {i: word for i, word in enumerate(words)}
w2i = {word: i for i, word in enumerate(words)}
print(i2w)
print(w2i)
Set comprehension avoids for loops.
words = ['i', 'like', 'to', 'eat', 'pizza', 'and', 'play', 'tennis']

# Don't do this: add the elements to an empty set one at a time.
vocab = set()
for word in words:
    vocab.add(word)
print(vocab)

# Do this: build the set with a set comprehension.
vocab = {word for word in words}
print(vocab)
x = 10
y = 15
z = 20

# Don't do this: join two comparisons that share an operand with `and`.
if x <= y and y <= z:
    print('hi')

# Do this: chain the comparisons.
if x <= y <= z:
    print('hi')
is_male = True

# Don't do this: compare a boolean against True.
if is_male == True:
    print('is male is true')

# Do this: test the value directly.
if is_male:
    print('is male is true')
is_male = True

# Don't do this: a four-line if/else that only picks one of two values.
if is_male:
    gender = 'male'
else:
    gender = 'female'
print(gender)

# Do this: a conditional expression picks the value in one line.
gender = 'male' if is_male else 'female'
print(gender)
# Don't do this: purely positional placeholders, so `name` has to be passed
# twice and the argument order must match the placeholders exactly.
name = 'John'
food = 'pizza'
sport = 'tennis'
sentence = '{} likes to eat {}. {} likes to play {}.'.format(name, food, name, sport)
print(sentence)

# Do this: three alternatives that refer to the values by name.
name = 'John'
food = 'pizza'
sport = 'tennis'
# variable substitution: {name} is a keyword field and can be reused, while
# the empty {} fields are filled from the positional arguments in order
sentence = '{name} likes to eat {}. {name} likes to play {}.'.format(food, sport, name=name)
print(sentence)
# f-string: the names are looked up in the surrounding scope
sentence = f'{name} likes to eat {food}. {name} likes to play {sport}.'
print(sentence)
# string template: $-placeholders filled in by substitute()
from string import Template
sentence = Template('$name likes to eat $food. $name likes to play $sport.')
print(sentence.substitute(name=name, food=food, sport=sport))
# Don't do this: type the repeated character out by hand.
print('---------------')

# Do this: multiply the string to repeat it 15 times.
print('-'*15)
# Don't do this: without __str__, print() falls back to the default object
# representation.
class Student():
    """A student identified by first and last name."""

    def __init__(self, first_name, last_name):
        self.first_name = first_name
        self.last_name = last_name


student = Student('John', 'Doe')
print(student)


# Do this: define __str__ so that print() shows something readable.
class Student():
    """A student identified by first and last name."""

    def __init__(self, first_name, last_name):
        self.first_name = first_name
        self.last_name = last_name

    def __str__(self):
        """Return '<first_name> <last_name>'."""
        return f'{self.first_name} {self.last_name}'


student = Student('John', 'Doe')
print(student)
symbols = ['A', 'B', 'C', 'D']

# Don't do this: nested loops with an index check to avoid duplicate pairs.
# The result lives in `symbol_pairs` so that the name `combinations` stays
# free for the itertools function imported below.
symbol_pairs = []
for i, symbol_i in enumerate(symbols):
    for j, symbol_j in enumerate(symbols):
        if i < j:
            tup = symbol_i, symbol_j
            symbol_pairs.append(tup)
print(symbol_pairs)

# Do this: itertools.combinations() pairs every position with each later
# position exactly once, which is the same rule as the `i < j` check above,
# so no additional filtering is needed.  Because the result is not assigned
# back to `combinations`, the function remains callable if this is re-run.
from itertools import combinations

symbol_pairs = list(combinations(symbols, 2))
print(symbol_pairs)
colors = ['red', 'green', 'blue']

# Don't do this: track the position by hand and reset it when it reaches the
# hard-coded length 3.
color_sequence = []
index = 0
for i in range(10):
    color_sequence.append(colors[index])
    index += 1
    if index == 3:
        index = 0
print(color_sequence)

# Do this: itertools.cycle() repeats the colors endlessly; next() is called
# ten times to take ten values from it.
from itertools import cycle

color_cycle = cycle(colors)
color_sequence = (next(color_cycle) for _ in range(10))
print(list(color_sequence))
a = ['cat', 'dog', 'frog']
b = ['red', 'green', 'blue']
c = ['big', 'small']

# Don't do this: one nested loop per input list.
product_list = []
for animal in a:
    for color in b:
        for size in c:
            tup = animal, color, size
            product_list.append(tup)
print(product_list)

# Do this: itertools.product() yields the same (animal, color, size) tuples.
from itertools import product

product_list = product(a, b, c)
print(list(product_list))

# When the inputs are themselves held in a list, unpack it with *.
list_of_list = [a, b, c]
product_list = product(*list_of_list)
print(list(product_list))
If you are working with enumerations, use the enum
package. In the example below, we have students who may be part, half or full time. If we simply declared these states with normal variables, they may be overwritten and there will be no context. On the other hand, if we use IntEnum
, once declared, these states are immutable and provide context.
# Don't do this: bare module-level integers can be rebound and have no type
# that ties them together.
PART_TIME = 1
HALF_TIME = 2
FULL_TIME = 3

# Do this: group the states in an IntEnum.
from enum import IntEnum


class StudentType(IntEnum):
    """Enrollment states of a student: part, half or full time."""

    PART_TIME = 1
    HALF_TIME = 2
    FULL_TIME = 3


print(StudentType.PART_TIME)
print(StudentType.HALF_TIME)
print(StudentType.FULL_TIME)
# Example 1
files = ['one.txt', 'two.py', 'three.txt', 'four.py', 'five.scala', 'six.java', 'seven.py']
# filter() with a lambda keeps the names that end in '.py'.
py_files = filter(lambda f: f.endswith('.py'), files)
print(list(py_files))
# Example 2
import os


def traverse(path):
    """Yield the joined path of every file under `path` whose name ends in '.ipynb'."""
    for basepath, directories, files in os.walk(path):
        for f in files:
            if f.endswith('.ipynb'):
                yield os.path.join(basepath, f)


# Walk the parent directory and count the notebooks found.
ipynb_files = traverse('../')
len(list(ipynb_files))
# NOTE: `files` and `os` are not defined in this snippet; they are expected
# to exist from earlier code.

# Example 1: fnmatch.filter() applies a shell-style pattern to a list of names.
import fnmatch
fnmatch.filter(files, '*.py')
# Example 2: match the pattern one directory at a time so that every hit can
# be joined with its directory.  The results are then full paths, like the
# ones produced by the pathlib variant below, instead of bare file names.
ipynb_files = [
    os.path.join(basepath, f)
    for basepath, directories, filenames in os.walk('../')
    for f in fnmatch.filter(filenames, '*.ipynb')
]
len(ipynb_files)
# Example 2, even better: Path.glob() with '**' searches recursively.
import pathlib
ipynb_files = pathlib.Path('../').glob('**/*.ipynb')
len(list(ipynb_files))
import pickle

object_1 = 'pretend some big object 1'
object_2 = 'pretend some big object 2'

# First variant: bundle the objects in one dictionary and pickle it to a file.
data = {
    'object_1': object_1,
    'object_2': object_2,
}

# The files are opened in with blocks so the handles are closed (and the
# written data flushed) deterministically rather than whenever the anonymous
# file object happens to be garbage-collected.
with open('data.p', 'wb') as f:
    pickle.dump(data, f)

# NOTE: only unpickle files you trust; loading a pickle can execute
# arbitrary code.
with open('data.p', 'rb') as f:
    data = pickle.load(f)

print(data['object_1'])
print(data['object_2'])
# Second variant: shelve stores each object under its own key.
# NOTE: `object_1` and `object_2` are not defined in this snippet; they are
# expected to exist from earlier code.
import shelve

# Write both objects; the shelf is closed when the with block is left.
with shelve.open('data') as s:
    s['object_1'] = object_1
    s['object_2'] = object_2

# Re-open the shelf and read the objects back by key.
with shelve.open('data') as s:
    print(s['object_1'])
    print(s['object_2'])
When operating over Pandas dataframes, avoid using for loops and favor the apply
function and Numpy vectorization
.
import numpy as np
import pandas as pd
# Fix the seed so that the random columns are reproducible.
np.random.seed(37)


def get_df():
    """Build a 10000 x 50 DataFrame of random features plus a placeholder label.

    Columns X0 .. X48 are drawn from a normal distribution with mean equal to
    the column number and standard deviation 1; the last column, 'y', is
    filled with -1.
    """
    n_rows = 10000
    n_cols = 50
    n_features = n_cols - 1

    # One (n_rows, 1) column per feature, drawn in column order.
    feature_blocks = [
        np.random.normal(mean, 1, n_rows).reshape(-1, 1)
        for mean in range(n_features)
    ]
    label_block = np.full(n_rows, -1).reshape(-1, 1)

    matrix = np.hstack(feature_blocks + [label_block])
    column_names = [f'X{i}' for i in range(n_features)] + ['y']
    return pd.DataFrame(matrix, columns=column_names)


df = get_df()
Standard for loop
.
%%time
# Slow baseline: visit every row by position and label it 1 when the sum of
# all columns except the last exceeds 1175, else 0.
# The label is written with a single positional .iloc call on the frame.
# The chained form df['y'].iloc[row] = y may assign into a temporary object
# and does not update df when pandas copy-on-write is active.
# NOTE: `df` and `np` are expected to exist from earlier code.
y_col = df.columns.get_loc('y')
for row in range(len(df)):
    total = np.sum(df.iloc[row, :-1])
    y = 1 if total > 1175 else 0
    df.iloc[row, y_col] = y
Pandas iterrows
.
%%time
# iterrows() yields (index label, row Series) pairs.  Each row is labelled 1
# when the sum of all its values except the last exceeds 1175, else 0.
# The label is written at the current index `i` with the scalar setter .at,
# so every row receives its own label and the write goes to df itself rather
# than to a temporary produced by chained indexing.
# NOTE: `df` and `np` are expected to exist from earlier code.
for i, r in df.iterrows():
    total = np.sum(r.iloc[:-1])
    y = 1 if total > 1175 else 0
    df.at[i, 'y'] = y
Pandas apply
.
%%time
# axis=1 passes one row at a time to the lambda, which returns 1 when the sum
# of all values except the last exceeds 1175 and 0 otherwise.
df['y'] = df.apply(lambda r: 1 if np.sum(r[0:df.shape[1] - 1]) > 1175 else 0, axis=1)
Numpy vectorization
. The approach below uses 3 lines to be clear about the intention, but the amount of time is in the milliseconds scale.
%%time
# Fully vectorized: sum every feature row in one NumPy call, then derive all
# labels with a single element-wise comparison instead of calling a Python
# function once per row.
# NOTE: `df` and `np` are expected to exist from earlier code.
feature_columns = [c for c in df.columns if c != 'y']
s = df[feature_columns].values.sum(axis=1)
df['y'] = np.where(s > 1175, 1, 0)