import sys
# force the notebook to look for files in the upper level directory
sys.path.insert(1, '../')
import pandas as pd
from glob import glob
import pymatgen as mg
from data.compound_featurizer import read_new_struct, \
get_struct, get_elem_info, get_elem_distances, \
calc_mm_dists, calc_mx_dists, calc_xx_dists, calc_elem_max_potential
# initialize an empty list of dataframes
df_lst = []
# iterate over all the cif files
for struct_file_path in glob("./user_defined_structures/featurizer_sub_function_demo/*.cif"):
# add the newly read in dataframe to the list
df_lst.append(read_new_struct(struct_file_path))
# concatenate all the dataframes in the list
df = pd.concat(df_lst, ignore_index=True)
# assign oxidation states to BaTiO3 and Mg2AlFeO5
df.at[df[df.Compound == "BaTiO3"].index[0], "structure"].add_oxidation_state_by_element({"Ba": 2, "Ti": 4, "O": -2})
df.at[df[df.Compound == "Mg2AlFeO5"].index[0], "structure"].add_oxidation_state_by_element({"Mg": 2, "Al": 3, "Fe": 3, "O": -2})
# here is a print out of the dataframe
df
Compound | structure | |
---|---|---|
0 | Mg2AlFeO5 | [[0.1798251 1.58702 2.644008 ] Mg2+, [ 6.407... |
1 | La2.8Mg1.2Mn4O12 | [[0.12084071 1.929845 5.45406422] La:0.700, ... |
2 | BaTiO3 | [[0.00849286 0.00844357 0.00854272] Ba2+, [2.1... |
Since we've already read in all the structures in dataframe, we can access the individual Pymatgen structure using the compound formula.
Tip: when you have questions about a specific function, you can always go to the original .py file or you can press ⇧ Shift + ⇥ Tab for its docstring
test_struct = get_struct("BaTiO3", df)
test_struct
Structure Summary Lattice abc : 4.081131019999999 4.08113102 4.08113102 angles : 89.66458222000001 89.66458222000001 89.66458222000001 volume : 67.97032918428083 A : 4.081061087960032 0.0 0.02389139478379035 B : 0.023751938902471077 4.080991968757093 0.02389139478379035 C : 0.0 0.0 4.08113102 PeriodicSite: Ba2+ (0.0085, 0.0084, 0.0085) [0.0021, 0.0021, 0.0021] PeriodicSite: Ti4+ (2.1199, 2.1076, 2.1323) [0.5164, 0.5164, 0.5164] PeriodicSite: O2- (1.9893, 1.9777, 3.9972) [0.4846, 0.4846, 0.9738] PeriodicSite: O2- (2.0009, 3.9739, 2.0126) [0.4846, 0.9738, 0.4846] PeriodicSite: O2- (3.9855, 1.9777, 2.0126) [0.9738, 0.4846, 0.4846]
If you happen to type in a formula that doesn't have an exact match, the function will return an error message along with several possible suggestions
get_struct("BaTiO", df)
-------------------------------- IndexErrorTraceback (most recent call last) ~/PycharmProjects/mit_model_code/data/compound_featurizer.py in get_struct(compound_formula, df_input, struct_type) 61 try: ---> 62 struct_output = df_input.loc[df_input.Compound == compound_formula, struct_type].values[0] 63 # if the formula has no exact in the input dataframe, IndexError: index 0 is out of bounds for axis 0 with size 0 During handling of the above exception, another exception occurred: ExceptionTraceback (most recent call last) <ipython-input-6-e69cb0779f37> in <module> ----> 1 get_struct("BaTiO", df) ~/PycharmProjects/mit_model_code/data/compound_featurizer.py in get_struct(compound_formula, df_input, struct_type) 65 except IndexError: 66 raise Exception("The structure does not exist in this dataframe. The closest matches are {}.". ---> 67 format(difflib.get_close_matches(compound_formula, df_input.Compound))) 68 return struct_output 69 Exception: The structure does not exist in this dataframe. The closest matches are ['BaTiO3'].
BaTiO3 will be used consistently as the demo test structure from now on.
Now that we have the structure, we can use get_elem_distances() to calculate the distance between any two elements in the structure
But before doing that, we first need to know which site(s) each element occupies through the get_elem_info() function
elem_indices, _, modified_struct = get_elem_info(test_struct)
print(elem_indices, "\n")
print(modified_struct)
{Element Ba: [0, 1], Element Ti: [2, 3], Element O: [4, 5, 6, 7, 8, 9]} Full Formula (Ba2 Ti2 O6) Reduced Formula: BaTiO3 abc : 8.162262 4.081131 4.081131 angles: 89.664582 89.664582 89.664582 Sites (10) # SP a b c --- ---- -------- -------- -------- 0 Ba2+ 0.001035 0.002069 0.002069 1 Ba2+ 0.501035 0.002069 0.002069 2 Ti4+ 0.25822 0.51644 0.51644 3 Ti4+ 0.75822 0.51644 0.51644 4 O2- 0.24231 0.484619 0.973753 5 O2- 0.742309 0.484619 0.973753 6 O2- 0.24231 0.973753 0.484619 7 O2- 0.742309 0.973753 0.484619 8 O2- 0.486876 0.484619 0.484619 9 O2- 0.986876 0.484619 0.484619
If you compare this to the printout from the original, you will find that the modified structure have double the amount of sites
print(test_struct)
Full Formula (Ba1 Ti1 O3) Reduced Formula: BaTiO3 abc : 4.081131 4.081131 4.081131 angles: 89.664582 89.664582 89.664582 Sites (5) # SP a b c --- ---- -------- -------- -------- 0 Ba2+ 0.002069 0.002069 0.002069 1 Ti4+ 0.51644 0.51644 0.51644 2 O2- 0.484619 0.484619 0.973753 3 O2- 0.484619 0.973753 0.484619 4 O2- 0.973753 0.484619 0.484619
This is because if we keep the original function, Ba and Ti will only occupy one site
elem_indices_orig, *_ = get_elem_info(test_struct, makesupercell=False)
elem_indices_orig
{Element Ba: [0], Element Ti: [1], Element O: [2, 3, 4]}
The reason for returning a supercell of the original structure is related to the inner workings of get_elem_distances() function. It basically works by getting the site indices of the two elements (they can be the same) and using the built-in method of pymatgen.Structure.get_distance(i, j) to calculate the distance between site i and site j. There is one scenario where only using the original structure can cause a problem:
By making a supercell (in this case a'=2a, b'=b, c'=c), we would be able to get a non-zero distance betweem the original site and the newly translated site along the a-axis. That being said, if all elements in the original structure all occupy more than one site, the structure will not be modified.
Let's first try to calculate the Ba-Ba distance using the supercell structure
get_elem_distances(test_struct,
elem_1=mg.Element("Ba"),
elem_indices=elem_indices, only_unique=True)
array([3.45281586])
Note: when the only_unique
parameter is set to be True
, the function will only return the unique values of distance since in a structure the same distance can occur multiple times due to symmetry.
Let's see what happens when we use the original reduced structure
get_elem_distances(test_struct,
elem_1=mg.Element("Ba"),
elem_indices=elem_indices_orig, only_unique=True)
array([0])
As expected, we get 0 Å. We can also calculate the distance between different elements. Let's see the distance between Ti and O
get_elem_distances(test_struct,
elem_1=mg.Element("O"), elem_2=mg.Element("Ti"),
elem_indices=elem_indices_orig, only_unique=True)
array([1.87390777, 1.87390777, 1.87390777])
This function can also handle structures where multiple elements can occupy the same site (La$_{2.8}$Mg$_{1.2}$Mn$_4$O$_{12}$ is a made-up structure generated for the purpose of this demo)
special_struct = get_struct("La2.8Mg1.2Mn4O12", df)
print(special_struct)
Full Formula (La2.8 Mg1.2 Mn4 O12) Reduced Formula: La2.8Mg1.2Mn4O12 abc : 6.462070 7.719380 5.480370 angles: 90.000000 90.000000 90.000000 Sites (20) # SP a b c --- ------------------ ------ ------ ------ 0 La:0.700, Mg:0.300 0.0187 0.25 0.9952 1 La:0.700, Mg:0.300 0.9813 0.75 0.0048 2 La:0.700, Mg:0.300 0.4813 0.75 0.4952 3 La:0.700, Mg:0.300 0.5187 0.25 0.5048 4 Mn 0 0 0.5 5 Mn 0.5 0 0 6 Mn 0 0.5 0.5 7 Mn 0.5 0.5 0 8 O 0.491 0.25 0.0629 9 O 0.509 0.75 0.9371 10 O 0.009 0.75 0.5629 11 O 0.991 0.25 0.4371 12 O 0.2742 0.0334 0.7249 13 O 0.7258 0.9666 0.2751 14 O 0.2258 0.9666 0.2249 15 O 0.7742 0.0334 0.7751 16 O 0.7258 0.5334 0.2751 17 O 0.2742 0.4666 0.7249 18 O 0.7742 0.4666 0.7751 19 O 0.2258 0.5334 0.2249
elem_indices, *_ = get_elem_info(special_struct)
distances = get_elem_distances(special_struct,
elem_1=mg.Element("La"), elem_2=mg.Element("Mn"),
elem_indices=elem_indices, only_unique=True)
distances
array([3.33227319, 3.33227319, 3.33227319, 3.66036914, 3.66036914, 3.66036914])
It may seem that there are some distances that are equal to each other, but since the values displayed do not have all the decimal places shown, there are still slight differences among them.
distances[0] - distances[1]
-4.440892098500626e-16
calc_mm_dists(test_struct, return_unique=True)
{'Ti-Ti': array([4.08113102]), 'Ti-Ba': array([3.45281586, 3.45281586, 3.49445999]), 'Ba-Ba': array([4.08113102])}
calc_mx_dists(test_struct, return_unique=True)
{'Ti-O': array([1.87390777, 1.87390777, 1.87390777, 1.87390777, 1.87390777, 2.22393768, 2.22393768]), 'Ba-O': array([2.79465755, 2.79465755, 2.88146024, 2.88146024, 2.88146024])}
calc_xx_dists(test_struct, return_unique=True)
{'O-O': array([2.81480587, 2.81480587, 2.81480587, 2.81480587, 2.89490539, 2.89490539, 2.89490539, 2.89490539])}
This functionality is realized again through the get_elem_info() function where all the elements in the structure is classified as either a metal or a non_metal.
_, elem_groups, _ = get_elem_info(test_struct)
elem_groups
{'non_metals': [Element O], 'all_metals': [Element Ti, Element Ba], 'most_electro_neg_metal': Element Ti, 'other_metals': [Element Ba]}
Once we know which elements are metal and which ones are non_metal, we can then use the elem_indices to find where they are (i.e. the site indices) and compute the distances using the generic element distance finder get_elem_distances().
The calc_elem_max_potential() utilizes the EwaldSummation() module from Pymatgen to calculate site energy for all the sites in a structure and convert the site energy to site potential using the relation as follows. ($U_{E_\text{tot}}$: the total potential energy of the structure, $U_{E_i}$: the site energy at site i, $N$: the total number of sites, $q_i$: the charge at site i, $\Phi(r_i)$: the site potential at site i)
$$ \begin{align*} U_{E_\text{tot}}&=\sum_{i=1}^{N}U_{E_i}=\frac{1}{2}\sum_{i=1}^{N}q_i\Phi(r_i)\\ U_{E_i}&=\frac{1}{2}q_i\Phi(r_i)\\ \Phi(r_i)&=\frac{2U_{E_i}}{q_i} \end{align*} $$The default output unit for the Madelung site potential is in $V$
calc_elem_max_potential(test_struct, full_list=True)
{Element Ba: [-19.007087770378423], Element Ti: [-44.09209640742854], Element O: [23.05600190304703, 23.056001903047033, 23.05600190304704]}
But the unit can be converted from $V$ to $e/Å$ for easier comparison with the results from VESTA
calc_elem_max_potential(test_struct, full_list=True, check_vesta=True)
{Element Ba: [-1.3199687332941026], Element Ti: [-3.0620255636372096], Element O: [1.6011501601113243, 1.6011501601113245, 1.601150160111325]}
If we don't specify the full_list
parameter, it will be set to False
and the function only return the maximum site potential for each element.
calc_elem_max_potential(test_struct)
{Element Ba: -19.007087770378423, Element Ti: -44.09209640742854, Element O: 23.05600190304704}
Just like before, this function can also work with structures where multiple elements occupy the same site. We can try a compound with non-integer stoichiometry this time. (again, Mg$_2$AlFeO$_5$ is a made-up structure)
non_stoich_struct = get_struct("Mg2AlFeO5", df)
print(non_stoich_struct)
Full Formula (Mg8 Al4 Fe4 O20) Reduced Formula: Mg2AlFeO5 abc : 6.587000 14.600000 5.374000 angles: 90.000000 90.000000 90.000000 Sites (36) # SP a b c --- ---------------------- ------ ------ ------ 0 Mg2+ 0.0273 0.1087 0.492 1 Mg2+ 0.9727 0.8913 0.492 2 Mg2+ 0.9727 0.6087 0.492 3 Mg2+ 0.0273 0.3913 0.492 4 Mg2+ 0.5273 0.6087 0.992 5 Mg2+ 0.4727 0.3913 0.992 6 Mg2+ 0.4727 0.1087 0.992 7 Mg2+ 0.5273 0.8913 0.992 8 Al3+:0.760, Fe3+:0.240 0.9283 0.25 0.9533 9 Al3+:0.760, Fe3+:0.240 0.0717 0.75 0.9533 10 Al3+:0.760, Fe3+:0.240 0.4283 0.75 0.4533 11 Al3+:0.760, Fe3+:0.240 0.5717 0.25 0.4533 12 Al3+:0.240, Fe3+:0.760 0 0 0 13 Al3+:0.240, Fe3+:0.760 0 0.5 0 14 Al3+:0.240, Fe3+:0.760 0.5 0.5 0.5 15 Al3+:0.240, Fe3+:0.760 0.5 0 0.5 16 O2- 0.2523 0.9861 0.2491 17 O2- 0.7477 0.0139 0.2491 18 O2- 0.7477 0.4861 0.2491 19 O2- 0.2523 0.5139 0.2491 20 O2- 0.7523 0.4861 0.7491 21 O2- 0.2477 0.5139 0.7491 22 O2- 0.2477 0.9861 0.7491 23 O2- 0.7523 0.0139 0.7491 24 O2- 0.068 0.1493 0.0246 25 O2- 0.932 0.8507 0.0246 26 O2- 0.932 0.6493 0.0246 27 O2- 0.068 0.3507 0.0246 28 O2- 0.568 0.6493 0.5246 29 O2- 0.432 0.3507 0.5246 30 O2- 0.432 0.1493 0.5246 31 O2- 0.568 0.8507 0.5246 32 O2- 0.8607 0.25 0.6193 33 O2- 0.1393 0.75 0.6193 34 O2- 0.3607 0.75 0.1193 35 O2- 0.6393 0.25 0.1193
calc_elem_max_potential(non_stoich_struct, check_vesta=True)
{Element Mg: -1.3304538713264666, Element Fe: -2.264442240109135, Element Al: -2.264442240109135, Element O: 1.6742872520162175}
If you want to test the functions with structures that are not in the loaded dataframe, you can also upload your own .cif file to the user_defined
folder located at this path
./user_defined_structures/
USER_DEFINED_FOLDER_PATH = "./user_defined_structures/"
example_new_struct = mg.Structure.from_file(USER_DEFINED_FOLDER_PATH + "CuNiO2_mp-1178372_primitive.cif")
example_new_struct
Structure Summary Lattice abc : 3.09746735 3.09746735 5.69675328 angles : 64.90585729 115.09414271000001 124.91178089 volume : 39.36031605994916 A : 2.8051040966198997 0.0 -1.3136571057328008 B : -1.3422903888963815 2.463100790619447 1.3136571057328006 C : 0.0 0.0 5.69675328 PeriodicSite: Cu (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000] PeriodicSite: Ni (0.0000, 0.0000, 2.8484) [0.0000, 0.0000, 0.5000] PeriodicSite: O (1.0256, 1.0569, 1.3444) [0.5709, 0.4291, 0.2687] PeriodicSite: O (0.4373, 1.4062, 4.3524) [0.4291, 0.5709, 0.7313]
def get_elem_distances_wrapper(structure: mg.Structure, **kwargs):
"""A wrapper function around get_elem_distances() such that there is no need to get elem_indices manually"""
elem_indices, _, structure = get_elem_info(structure)
return get_elem_distances(structure, elem_indices=elem_indices, only_unique=True, **kwargs)
Check the Cu-Ni distance
get_elem_distances_wrapper(example_new_struct, elem_1=mg.Element("Cu"), elem_2=mg.Element("Ni"))
array([2.84837664, 3.19749481])
Check the Ni-O distance
get_elem_distances_wrapper(example_new_struct, elem_1=mg.Element("O"), elem_2=mg.Element("Ni"))
array([2.07845014, 2.07845014, 2.10492626, 2.10492626])
Check the Cu-Cu distance
get_elem_distances_wrapper(example_new_struct, elem_1=mg.Element("Cu"))
array([2.864732])
calc_mm_dists(example_new_struct)
{'Ni-Ni': array([2.864732]), 'Ni-Cu': array([2.84837664, 3.19749481, 3.19749481, 2.84837664]), 'Cu-Cu': array([2.864732])}
calc_mx_dists(example_new_struct)
{'Ni-O': array([2.10492626, 2.07845014, 2.07845014, 2.10492626, 2.07845014, 2.10492626, 2.10492626, 2.07845014]), 'Cu-O': array([1.99401095, 1.99401095, 1.99401095, 1.99401095, 1.99401095, 1.99401095, 1.99401095, 1.99401095])}
calc_xx_dists(example_new_struct)
{'O-O': array([2.864732 , 2.77446017, 2.81194507, 2.81194507, 2.77446017, 2.864732 ])}
To use the EwaldSummation technique, the input structure has to have oxidation states (that's where the charge value comes from) associated with all the sites. A structure without oxidation states will raise an error in the function.
calc_elem_max_potential(example_new_struct)
-------------------------------- AttributeErrorTraceback (most recent call last) <ipython-input-36-9a25972534a7> in <module> ----> 1 calc_elem_max_potential(example_new_struct) ~/PycharmProjects/mit_model_code/data/compound_featurizer.py in calc_elem_max_potential(structure_oxid, full_list, check_vesta) 589 590 # define a dictionary that stores the oxidation states for all the element --> 591 elem_charge_lookup = {specie.element: specie.oxi_state for specie in structure_oxid.composition.elements} 592 # if there is only one element, then ewald summation will not work 593 if len(elem_charge_lookup) == 1: ~/PycharmProjects/mit_model_code/data/compound_featurizer.py in <dictcomp>(.0) 589 590 # define a dictionary that stores the oxidation states for all the element --> 591 elem_charge_lookup = {specie.element: specie.oxi_state for specie in structure_oxid.composition.elements} 592 # if there is only one element, then ewald summation will not work 593 if len(elem_charge_lookup) == 1: ~/PycharmProjects/mit_model_code/venv/lib/python3.7/site-packages/pymatgen/core/periodic_table.py in __getattr__(self, item) 508 pass 509 return val --> 510 raise AttributeError("Element has no attribute %s!" % item) 511 512 @property AttributeError: Element has no attribute oxi_state!
To overcome this problem, we can add oxidation states to the structure using the add_oxidation_state_by_guess() method from Pymatgen
example_new_struct.add_oxidation_state_by_guess()
example_new_struct
Structure Summary Lattice abc : 3.09746735 3.09746735 5.69675328 angles : 64.90585729 115.09414271000001 124.91178089 volume : 39.36031605994916 A : 2.8051040966198997 0.0 -1.3136571057328008 B : -1.3422903888963815 2.463100790619447 1.3136571057328006 C : 0.0 0.0 5.69675328 PeriodicSite: Cu2+ (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000] PeriodicSite: Ni2+ (0.0000, 0.0000, 2.8484) [0.0000, 0.0000, 0.5000] PeriodicSite: O2- (1.0256, 1.0569, 1.3444) [0.5709, 0.4291, 0.2687] PeriodicSite: O2- (0.4373, 1.4062, 4.3524) [0.4291, 0.5709, 0.7313]
Now that we should be able to obtain proper results from the function.
calc_elem_max_potential(example_new_struct, check_vesta=True)
{Element Cu: -1.5818928513834425, Element Ni: -1.7340300140072384, Element O: 1.6414884929438913}