#!/usr/bin/env python
# coding: utf-8

# # Session 3: File Input/Output

# ## File I/O: Basic

# * Data on a computer is usually stored in **files**
# * From the view of the operating system, a file is just a sequence of
#   bits that is given a name
# * What data is stored in a file and how exactly it is stored in a file
#   is defined in a **file format**
# * The file format defines what the bits mean to the program that is reading/writing
#   the file
# * ***Note:*** The **file extension** (e.g. whether the name of a file ends
#   in **.txt** or **.doc**) does not determine the file format (it is
#   just a name) -- but it makes sense to name files according to their format

# ## File I/O: Writing to a Text File

# * A very common and useful file format is one where the sequence of bits
#   is interpreted as a sequence of characters
# * This conversion is performed with respect to a character set
#   (such as ASCII or UTF-8, but let's not worry about that here...)
# * In Python, such **text files** can be manipulated very easily, by
#   reading/writing their contents to/from strings
# * Using the `open()` function one can obtain a reference to a
#   **file** object that provides methods for reading and writing (e.g.
#   `read()` and `write()`)

# ## File I/O: Text Files

# ### File I/O: Writing to a text file:

# Opening a text file for writing

# In[ ]:

f = open('my_first_file.txt', 'w')

# In[ ]:

f.write('Hello world!')

# In[ ]:

f.close()

# We can now read this file again:

# In[ ]:

f = open('my_first_file.txt', 'r')

# In[ ]:

line = f.readline()

# In[ ]:

print(line)

# In[ ]:

f.close()

# Write can be called multiple times to write more data:

# In[ ]:

f = open("animals.txt", "w")

# In[ ]:

# NOTE: the last entry already ends in "\n" and another "\n" is appended
# below, so the file finishes with one extra empty line.
for animal in ["Animal\tFood", "Sloth\tLeaves", "Chicken\tCorn",
               "Ant_eater\tAnts", "Penguin\tFish", "Armadillo\tIce_cream\n"]:
    f.write("%s\n" % animal)

# In[ ]:

f.close()

# ## File I/O: Reading from a Text File:

# #### Reading the content of a text file using the `readlines()` function:

# The `readlines()` function reads an entire text file into a list of
# strings, where each list entry corresponds to a line in the file

# In[ ]:

f = open("animals.txt", "r")

# In[ ]:

lines = f.readlines()

# In[ ]:

# The whole file is now held in `lines`, so close the handle before `f` is
# reused further down (otherwise this file object is never closed).
f.close()

# In[ ]:

print(lines)

# In[ ]:

len(lines)

# Because the entire file is first read into memory, this can be slow or
# infeasible for large files

# Now print each line:

# In[ ]:

for l in lines:
    print(l)

# In[ ]:

for l in lines:
    print(l.rstrip())

# The `print` function appends a `\n` automatically; unless the `\n`
# characters already present are removed with `rstrip()` we end up with
# empty lines!

# #### Reading the content of a text file line by line:

# Because processing each line in a file is such a common operation,
# Python provides the following simple syntax

# In[ ]:

f = open("animals.txt", "r")

# In[ ]:

for line in f:
    print(line.rstrip())

# In[ ]:

f.close()

# This iterates over the file line by line instead of reading in the whole
# content in the beginning!

# #### And because python makes your life easy, here an even shorter version:

# In[ ]:

with open("animals.txt", "r") as infile:
    for line in infile:
        print(line.rstrip())

# Using `with` removes the necessity to call the `close()` function on your
# file object!
# ## File I/O: Transforming a File:

# * When working with data provided by other programs (and/or other
#   people), it is often necessary to convert data from one format to another
#
# The file that we wrote contained columns separated by tabs; what if we
# need commas?

# In[ ]:

import os

# Split every input line on whitespace and write the fields back out joined
# by commas, one output line per input line.
with open("animals.txt", "r") as infile, open("animals.csv", "w") as outfile:
    outfile.writelines(",".join(line.split()) + '\n' for line in infile)

# Let's check everything worked...

# In[ ]:

with open("animals.csv", "r") as infile:
    for line in infile:
        print(line.rstrip())

# Looking good!

# ## File I/O Pickling:

# * Text files are convenient when data needs to be exchanged with other
#   programs
# * However, getting the data in/out of text files can be tedious
# * If we know we only need the data within Python, there is a very easy
#   way to write arbitrary Python data structures to compact binary files
# * This is generally referred to as **serialization**, but in
#   Python-lingo it's called **pickling**
# * The **pickle** module (and, in Python 2, its more efficient **cPickle**
#   version) provides two functions, `dump()` and `load()`, that
#   allow writing and reading arbitrary Python objects

# In[ ]:

from pickle import dump, load

# In[ ]:

l = ["a", "list", "with", "stuff", [42, 23, 3.14], True]

# In[ ]:

# Pickle files are binary, hence the "wb" mode.
with open("my_list.pkl", "wb") as f:
    dump(l, f)

# In[ ]:

# Read the object back, again in binary mode.
with open("my_list.pkl", "rb") as f:
    l = load(f)
l

# ## File I/O Checking for Existence:

# * Sometimes a program needs to check whether a file exists
# * The `os.path` module provides the `exists()` function

# In[4]:

from os.path import exists

# In[5]:

print("Lockfile exists!" if exists("lockfile") else "No lockfile found!")

# In general, the `os` and `os.path` modules provide functions for
# manipulating the file system. Don't try to reinvent the wheel - most things
# exist already in the Python standard library!
# ## File I/O: Reading from the Web:

# * In Python, there are several other objects that behave just like text
#   files
# * One particularly useful one provides file-like access to resources on
#   the web: the `urlopen()` function in the `urllib.request` module
#   (called `urllib2` in Python 2)

# In[11]:

# `exists` is imported here as well so that this cell runs on its own.
from os.path import exists
from urllib.request import urlopen

# In[12]:

URL = "http://www.gutenberg.org/cache/epub/28885/pg28885.txt"

# In[13]:

# Download the book only once; later runs reuse the local copy.
if not exists("alice.txt"):
    # Fetch everything first so that a failed download does not leave an
    # empty "alice.txt" behind that would be mistaken for a cached copy.
    with urlopen(URL) as response:
        data = response.read()
    # `read()` returns bytes, so the file is written in binary mode.
    with open("alice.txt", "wb") as outfile:
        outfile.write(data)

# In[14]:

# Text mode: lines come back as `str`.
# NOTE(review): assumes the downloaded file is UTF-8 encoded -- confirm;
# errors="replace" keeps the example running if a byte does not decode.
with open("alice.txt", encoding="utf-8", errors="replace") as infile:
    print(''.join(infile.readlines()[970:975]))

# In[15]:

# Binary mode: lines come back as `bytes`, so they have to be joined with a
# bytes separator and decoded before printing as text.
with open("alice.txt", "rb") as infile:
    book = infile.readlines()
print(b"".join(book[1000:1005]).decode("utf-8", errors="replace"))

# ## File I/O Multiple Files:

# The `glob` module provides an easy way to find all files with certain names
# (e.g. all files with names that end in `.txt`)

# In[ ]:

import glob

# In[ ]:

text_files = glob.glob("*.txt")

# In[ ]:

for t in text_files:
    print(t)

# ## File I/O Terminal streams:

# * The terminal input/output streams can also be accessed like files using
#   the `stdin` and `stdout` objects from the `sys` module

# In[ ]:

import sys

# In[ ]:

sys.stdout.write("Another way to print!\n")