# Source code for symmetr.magndata

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import io
import math
import re

from six.moves import urllib
from lxml import html
from bs4 import BeautifulSoup
import pandas as pd
from pymatgen.core import Structure, Lattice, Element

# Names exported by ``from symmetr.magndata import *``.  Every entry must be
# defined in this module, otherwise the star-import raises AttributeError;
# the page downloader defined below is called ``download_html``.
__all__ = ['download_html', 'get_magndata_structure']

def download_html(id):
    """Download the MAGNDATA page of one entry and return it as text.

    Args:
        id: Index of the MAGNDATA entry.  It is converted with ``str`` and
            appended to the query URL, so non-string identifiers are
            accepted as well.  (The name shadows the builtin ``id`` but is
            kept so that existing keyword callers continue to work.)

    Returns:
        The response body decoded as UTF-8; bytes that cannot be decoded
        are ignored.

    Raises:
        Any exception raised by ``urllib.request.urlopen`` or by reading the
        response is propagated unchanged.
    """
    url = 'https://cryst.ehu.es/magndata/index.php?index=' + str(id)
    page = urllib.request.urlopen(url)
    try:
        mybytes = page.read()
    finally:
        # Release the connection even when reading the body fails.
        page.close()
    return mybytes.decode("utf-8", errors='ignore')

def get_magndata_structure(id):
    """Download a MAGNDATA entry and build a pymatgen ``Structure`` from it.

    The page is fetched with ``download_html``, the lattice parameters are
    read from the page text with ``parse_lattice_parameter`` and the atomic
    sites from the HTML tables with ``parse_tables``.

    Args:
        id: Index of the MAGNDATA entry, passed on to ``download_html``.

    Returns:
        A ``Structure`` whose sites carry the site properties ``magmom``,
        ``magmom_basis`` (always ``'crystal_scaled'``), ``occupancy``,
        ``label`` and ``magnetic_constraints``.

    Raises:
        Exception: If downloading or the initial HTML/table parsing fails,
            or if the ``Structure`` cannot be constructed.  Errors raised by
            ``parse_lattice_parameter``, ``Lattice.from_parameters`` and
            ``parse_tables`` propagate unchanged.
    """
    try:
        htmlstr = download_html(id)
        root = html.document_fromstring(htmlstr)
        # Hand pandas a text buffer rather than the raw markup: passing
        # literal HTML strings to read_html is deprecated in recent pandas,
        # while file-like input is accepted by old and new versions alike.
        dfs = pd.read_html(io.StringIO(process_html(htmlstr)))
    except Exception as e:
        raise Exception("Error in downloading magndata page {}".format(e))

    # Parse html contents
    lattice_parameter = parse_lattice_parameter(root)
    lattice = Lattice.from_parameters(*lattice_parameter)

    # Fetch tabled contents; the summary dictionary is not needed here.
    sites, _ = parse_tables(dfs)
    moments = sites['moments']
    properties = {
        'magmom': moments,
        'magmom_basis': ['crystal_scaled'] * len(moments),
        'occupancy': sites['occupancy'],
        'label': sites['labels'],
        'magnetic_constraints': sites['constraints'],
    }

    try:
        structure = Structure(lattice, sites['species'], sites['coords'],
                              site_properties=properties)
    except Exception as e:
        raise Exception("Error in extracting the structure from MAGNDATA: {}".format(e))
    return structure
def parse_lattice_parameter(root):
    """Read the magnetic unit cell parameters from the parsed page.

    The text content of ``/html/body`` is split into lines.  The line that
    follows the first line containing
    'Lattice parameters of the magnetic unit cell' is scanned for decimal
    numbers (digits around a ``.``).

    Args:
        root: Parsed document offering ``xpath`` (as returned by
            ``lxml.html.document_fromstring``).

    Returns:
        List of floats found on the lattice-parameter line, in order.

    Raises:
        Exception: If the header line is missing, has no following line, or
            a matched token cannot be converted to ``float``.
    """
    body_xpath = root.xpath('/html/body')
    body_lines = body_xpath[0].text_content().split('\n')

    lattice_line = None
    # The last line is skipped on purpose: it has no successor to read from.
    for i, line in enumerate(body_lines[:-1]):
        if 'Lattice parameters of the magnetic unit cell' in line:
            lattice_line = body_lines[i + 1]
            break
    if lattice_line is None:
        raise Exception("Problem in extracting lattice parameters from MAGNDATA: "
                        "lattice parameter line not found")

    try:
        lstr = re.findall(r'([0-9]*\.[0-9]*)', lattice_line)
        lattice_parameter = [float(i) for i in lstr]
    except Exception as e:
        raise Exception("Problem in extracting lattice parameters from MAGNDATA: {}".format(e))
    return lattice_parameter


def parse_tables(dfs):
    """Collect the atomic sites of a MAGNDATA entry from its HTML tables.

    Tables with a 'Label' column describe one atom per row; tables with an
    'Atom' column list the positions (and, for magnetic atoms, moments) of
    one label.  The i-th 'Atom' table is paired with the i-th label.

    Args:
        dfs: Iterable of ``pandas.DataFrame`` objects (the result of
            ``pandas.read_html`` on the processed page).

    Returns:
        Tuple ``(sites, info)``.  ``sites`` maps 'labels', 'species',
        'coords', 'moments', 'occupancy' and 'constraints' to per-site
        lists.  ``info`` holds 'num_of_atoms', 'overlapped_pairs', 'alloy',
        'defect', 'magnetic_atoms', 'disorder_atoms', 'elements' and
        'max_element_number'.

    Raises:
        Exception: If no table with a '|M|' column exists, if a selected
            table has neither a 'Label' nor an 'Atom' column, or if a value
            cannot be converted to ``float``.
    """
    # Extract atomic tables
    tables = []
    for df in dfs:
        column = df.columns.map(str)
        if 'Label' in column:
            tables.append(df)
        if 'Atom' in column:
            # Drop rows whose 'Atom' entry is a string that is not a number.
            delete = [i for i, row in df.iterrows()
                      if isinstance(row.Atom, str) and not row.Atom.isdigit()]
            tables.append(df.drop(index=delete))

    # Remove duplication: keep the tables from the first '|M|' table up to
    # (not including) the next one, or up to the end if there is no second.
    m_indices = [i for i, table in enumerate(tables)
                 if '|M|' in table.columns.map(str)]
    if not m_indices:
        raise Exception("Problem in parsing MAGNDATA tables: "
                        "no table with a '|M|' column was found")
    start_index = m_indices[0]
    end_index = m_indices[1] if len(m_indices) > 1 else len(tables)
    atom_tables = tables[start_index:end_index]

    # Get information
    labels = []
    atom_types = []
    mag_atoms = []
    coord_tables = []
    dis_atoms = []
    dis_labels = []
    dis_occupancy = {}
    dis_coord = []
    # True as soon as any label table has an 'Occupancy' column; it is not
    # reset by later tables so recorded partial occupancies are never lost.
    alloy = False
    for table in atom_tables:
        column = table.columns.map(str)
        if 'Label' in column:
            labels.extend(table.loc[:, 'Label'])
            atom_types.extend(strip_atom(table.loc[:, 'Atom type']))
            if '|M|' in column:
                mag_atoms.extend(table.loc[:, 'Label'])
            if 'Occupancy' in column:
                alloy = True
                for _, row in table.iterrows():
                    if row.Occupancy < 1:
                        dis_atoms.append(row['Atom type'])
                        dis_labels.append(row['Label'])
                        dis_occupancy[row.Label] = row.Occupancy
                        dis_coord.append([row.x, row.y, row.z])
        elif 'Atom' in column:
            coord_tables.append(table)
        else:
            # Raise instead of terminating the interpreter from library code.
            raise Exception('Error: Failed to parse MAGNDATA tables.')

    # Find the positional overlapping of disorder atoms
    lapped_pairs = []
    if alloy:
        for i, label1 in enumerate(dis_labels):
            for j in range(i + 1, len(dis_labels)):
                if dis_coord[i] == dis_coord[j]:
                    lapped_pairs.append([label1, dis_labels[j]])

    species = []
    ext_labels = []
    coords = []
    moments = []
    mag_constraints = []
    occupancy = []
    for i, table in enumerate(coord_tables):
        label = labels[i]
        element = atom_types[i]
        is_magnetic = label in mag_atoms

        # Sites are fully occupied unless the label has a recorded
        # occupancy below one.
        site_occupancy = 1.0
        if alloy and label in dis_occupancy:
            try:
                site_occupancy = float(dis_occupancy[label])
            except (TypeError, ValueError) as e:
                raise Exception("Problem in parsing occupancy of {}: {}".format(label, e))

        for _, row in table.iterrows():
            species.append(element)
            ext_labels.append(label)
            coords.append(convert_float([row.x, row.y, row.z]))
            if is_magnetic:
                moments.append(convert_float([row.Mx, row.My, row.Mz]))
                mag_constraints.append(row['Symmetry constraints on M'])
            else:
                moments.append([0.0] * 3)
                mag_constraints.append('0,0,0')
            occupancy.append(site_occupancy)

    # Sorted so the order of the reported elements is reproducible.
    elements = sorted(set(atom_types))
    max_elem_num = max_element_number(elements)

    num = float(sum(occupancy))
    # Compare with a tolerance: summing fractional occupancies can miss an
    # integer total by a rounding error.
    defect = not math.isclose(num, round(num), abs_tol=1e-9)

    sites = {
        'labels': ext_labels,
        'species': species,
        'coords': coords,
        'moments': moments,
        'occupancy': occupancy,
        'constraints': mag_constraints
    }
    info = {
        'num_of_atoms': num,
        'overlapped_pairs': lapped_pairs,
        'alloy': alloy,
        'defect': defect,
        'magnetic_atoms': mag_atoms,
        'disorder_atoms': dis_atoms,
        'elements': elements,
        'max_element_number': max_elem_num,
    }
    return sites, info


def process_html(htmlstr):
    """Prepare the MAGNDATA page markup for ``pandas.read_html``.

    Table rows that consist of a single ``colspan`` cell are removed and
    every ``<tbody>`` wrapper is unwrapped, which merges the separate table
    bodies of the page.

    Args:
        htmlstr: Page markup as text.

    Returns:
        The processed markup, serialised from the BeautifulSoup tree.
    """
    # Decompose merged cells
    new = re.sub(r'<tr><td[^>]*colspan=[^>]*>.*?<\/td><\/tr>', '', htmlstr)
    # Fix HTML
    soup = BeautifulSoup(new, "lxml")
    for body in soup("tbody"):
        body.unwrap()
    return str(soup)


def convert_float(vec):
    """Return the items of ``vec`` converted to ``float`` as a new list.

    Raises:
        Exception: If any item cannot be converted.
    """
    try:
        return [float(x) for x in vec]
    except Exception as e:
        raise Exception("Problem in convert_float {}".format(e))


def strip_atom(atoms):
    """Reduce atom-type strings to their leading run of letters.

    The first run of ASCII letters of each entry is kept (so 'Fe1' becomes
    'Fe'); deuterium ('D') is reported as hydrogen ('H').

    Args:
        atoms: Iterable of atom-type strings.

    Returns:
        List with one stripped symbol per input entry.

    Raises:
        Exception: If an entry contains no letters.
    """
    spec = []
    for atom in atoms:
        # Letters only: a '|' inside a character class is a literal pipe,
        # not an alternation, so it must not appear in the class.
        match = re.search(r'[A-Za-z]+', atom)
        if match is None:
            raise Exception("Problem in strip_atom: no element symbol in {!r}".format(atom))
        a = match.group()
        # Deuteriums are converted into simple hydrogen
        if a == 'D':
            a = 'H'
        spec.append(a)
    return spec


def max_element_number(elements):
    """Return the largest ``Element(e).number`` over the given symbols."""
    return max(Element(e).number for e in elements)