Source code for symmetr.magndata

# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

from six.moves import urllib
from lxml import html
from bs4 import BeautifulSoup
import pandas as pd
import re
from pymatgen.core import Structure, Lattice, Element

__all__ = ['download_id', 'get_magndata_structure']

def download_html(id):
    page = urllib.request.urlopen('https://cryst.ehu.es/magndata/index.php?index='+id)
    mybytes = page.read()
    page_txt = mybytes.decode("utf-8", errors='ignore')
    page.close()
    return page_txt


[docs]
def get_magndata_structure(id):
    try:
        htmlstr = download_html(id)
        root = html.document_fromstring(htmlstr)
        dfs = pd.read_html(process_html(htmlstr))

    except Exception as e:
        raise Exception("Error in downloading magndata page {}".format(e))

    info = {}
    # Parse html contents
    lattice_parameter = parse_lattice_parameter(root)
    lattice = Lattice.from_parameters(*lattice_parameter)

    # Fetch tabled contents
    sites, tabled_info = parse_tables(dfs)
    species = sites['species']
    coords = sites['coords']
    moments = sites['moments']
    occupancy = sites['occupancy']
    constraints = sites['constraints']
    labels = sites['labels']

    properties = {'magmom': moments, 'magmom_basis': ['crystal_scaled'] * len(moments), 'occupancy': occupancy,
                  'label': labels, 'magnetic_constraints': constraints}
    try:
        structure = Structure(lattice, species, coords, site_properties=properties)
    except Exception as e:
        raise Exception("Error in extracting the structure from MAGNDATA: {}".format(e))
    return structure



def parse_lattice_parameter(root):
    body_xpath = root.xpath('/html/body')
    body_lines = body_xpath[0].text_content().split('\n')

    for i, line in enumerate(body_lines):
        if re.search('Lattice parameters of the magnetic unit cell', line):
            lattice_line = body_lines[i + 1]
            break

    try:
        lstr = re.findall(r'([0-9]*\.[0-9]*)', lattice_line)
        lattice_parameter = [float(i) for i in lstr]
    except Exception as e:
       raise Exception("Problem in extracting lattice parameters from MAGNDATA: {}".format(e))

    return lattice_parameter

def parse_tables(dfs):
    # Extract atomic tables
    tables = []
    for df in dfs:
        column = df.columns.map(str)
        if 'Label' in column:
            tables.append(df)
        if 'Atom' in column:
            # tables.append(df[~df['Atom'].str.contains('click')])
            delete = []
            for i, row in df.iterrows():
                rowstr = row.Atom
                #if type(rowstr) is str or type(rowstr) is unicode:
                if type(rowstr) is str:
                    if not rowstr.isdigit():
                        delete.append(i)
            tables.append(df.drop(index=delete))

    # Remove duplication
    for i, table in enumerate(tables):
        column = table.columns.map(str)
        if '|M|' in column:
            start_index = i
            break
    for i in range(start_index + 1, len(tables)):
        column = tables[i].columns.map(str)
        if '|M|' in column:
            end_index = i
            break
    atom_tables = [tables[i] for i in range(start_index,end_index)]

    # Get infomations
    labels = []
    atom_types = []
    mag_atoms = []
    multiplicities = []
    coord_tables = []
    dis_atoms = []
    dis_labels = []
    dis_occupancy = {}
    dis_coord = []
    for table in atom_tables:
        column = table.columns.map(str)
        if 'Label' in column:
            labels.extend(table.loc[:,'Label'])
            atom_types.extend(table.loc[:, 'Atom type'])
            atom_types = strip_atom(atom_types)
            multiplicities.extend(table.loc[:,'Multiplicity'])
            if '|M|' in column:
                mag_atoms.extend(table.loc[:,'Label'])
            if 'Occupancy' in column:
                alloy = True
                for j, row in table.iterrows():
                    if row.Occupancy < 1:
                        dis_atoms.append(row['Atom type'])
                        dis_labels.append(row['Label'])
                        dis_occupancy[row.Label] = row.Occupancy
                        coord = [row.x,row.y,row.z]
                        dis_coord.append(coord)
            else:
                alloy = False
        elif 'Atom' in column:
            coord_tables.append(table)
        else:
            print('Error: Failed to parse MAGNDATA tables.')
            exit(1)

    # Find the positional overlapping of disorder atoms
    lapped_pairs = []
    if alloy:
        for i, label1 in enumerate(dis_labels):
            for j in range(i+1,len(dis_labels)):
                label2 = dis_labels[j]
                if dis_coord[i] == dis_coord[j]:
                    lapped_pairs.append([label1,label2])

    species = []
    ext_labels = []
    coords = []
    moments = []
    mag_constraints =[]
    occupancy = []
    for i, table in enumerate(coord_tables):
        column = table.columns.map(str)

        for j, row in table.iterrows():
            species.append(atom_types[i])
            ext_labels.append(labels[i])
            coord = [row.x,row.y,row.z]
            coords.append(convert_float(coord))
            if labels[i] in mag_atoms:
                moment = [row.Mx,row.My,row.Mz]
                moments.append(convert_float(moment))
                mag_constraints.append(row['Symmetry constraints on M'])
            else:
                moments.append([0.0] * 3)
                mag_constraints.append('0,0,0')
            if alloy:
                if labels[i] in dis_labels:
                    try:
                        occupancy.append(float(dis_occupancy[labels[i]]))
                    except:
                        print_exc()
                        exit(3)
                else:
                    occupancy.append(1.0)
            else:
                occupancy.append(1.0)

    elements = list(set(atom_types))
    max_elem_num = max_element_number(elements)

    num = float(sum(occupancy))
    if num.is_integer():
        defect = False
    else:
        defect = True

    sites = {
        'labels': ext_labels,
        'species': species,
        'coords': coords,
        'moments': moments,
        'occupancy': occupancy,
        'constraints': mag_constraints
    }

    info = {
        'num_of_atoms': num,
        'overlapped_pairs': lapped_pairs,
        'alloy': alloy,
        'defect': defect,
        'magnetic_atoms': mag_atoms,
        'disorder_atoms': dis_atoms,
        'elements': elements,
        'max_element_number': max_elem_num,
        }

    return sites, info

def process_html(htmlstr):
    # Merge the sepalate tables in MAGNDATA html (for pandas.read_html)

    # Decompose merged cells
    new = re.sub(r'<tr><td[^>]*colspan=[^>]*>.*?<\/td><\/tr>','',htmlstr)

    # Fix HTML
    soup = BeautifulSoup(new, "lxml")
    for body in soup("tbody"):
        body.unwrap()

    return str(soup)

def convert_float(vec):
    try:
        newvec = [float(x) for x in vec]
        return newvec
    except Exception as e:
        raise Exception("Problem in convert_float {}".format(e))

def strip_atom(atoms):
    spec = []
    for atom in atoms:
        a = re.search(r'[a-z|A-Z]+', atom).group()
        # Deuteriums are converted into simple hydrogen
        if a == 'D':
            a = 'H'
        spec.append(a)
    return spec

def max_element_number(elements):
    elements_nums = [Element(e).number for e in elements]
    return max(elements_nums)