# Calculate the average number of groups (residues) per protein chain.
from pyspark.sql import SparkSession
from mmtfPyspark.io import mmtfReader
from mmtfPyspark.filters import ContainsLProteinChain
from mmtfPyspark.mappers import StructureToPolymerChains
# Start (or reuse) the Spark session used by this analysis.
spark = SparkSession.builder.appName("Solution-2").getOrCreate()

# Load the structures from the sample directory.
path = "../resources/mmtf_reduced_sample/"
pdb = mmtfReader.read_sequence_file(path)

# Split each structure into its polymer chains and keep only the chains
# accepted by the ContainsLProteinChain filter.
prot_chains = (
    pdb.flatMap(StructureToPolymerChains())
    .filter(ContainsLProteinChain())
)

# Accumulate (sum of group counts, number of chains) in a single pass.
# Running reduce() and count() as two separate actions would evaluate the
# uncached read -> flatMap -> filter lineage twice.
total_groups, num_chains = prot_chains.map(lambda t: t[1].num_groups).aggregate(
    (0, 0),
    lambda acc, groups: (acc[0] + groups, acc[1] + 1),
    lambda left, right: (left[0] + right[0], left[1] + right[1]),
)

# reduce() raised ValueError on an empty RDD; keep that failure mode explicit
# instead of surfacing a ZeroDivisionError from the division below.
if num_chains == 0:
    raise ValueError(f"No protein chains found in {path!r}")

# Average number of groups per protein chain (displayed as the cell result
# when run in a notebook).
total_groups / num_chains
# Output: 197.10918825374506
# Stop the Spark session now that the result has been computed.
spark.stop()