diff --git a/matminer/datasets/dataset_metadata.json b/matminer/datasets/dataset_metadata.json index 9f8d365e9..0ecedf795 100644 --- a/matminer/datasets/dataset_metadata.json +++ b/matminer/datasets/dataset_metadata.json @@ -605,6 +605,7 @@ }, "matbench_mp_is_metal": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. 
We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David and Ceder, Gerbrand and Persson, Kristin a.},\ndoi = {10.1063/1.4812323},\nissn = {2166532X},\njournal = {APL Materials},\nnumber = {1},\npages = {011002},\ntitle = {{The Materials Project: A materials genome approach to accelerating materials innovation}},\nurl = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\nvolume = {1},\nyear = {2013}\n}" ], "columns": { @@ -620,6 +621,7 @@ }, "matbench_mp_gap": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. 
The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David and Ceder, Gerbrand and Persson, Kristin a.},\ndoi = {10.1063/1.4812323},\nissn = {2166532X},\njournal = {APL Materials},\nnumber = {1},\npages = {011002},\ntitle = {{The Materials Project: A materials genome approach to accelerating materials innovation}},\nurl = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\nvolume = {1},\nyear = {2013}\n}" ], "columns": { @@ -635,6 +637,7 @@ }, "matbench_mp_e_form": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational 
Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. 
We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David and Ceder, Gerbrand and Persson, Kristin a.},\ndoi = {10.1063/1.4812323},\nissn = {2166532X},\njournal = {APL Materials},\nnumber = {1},\npages = {011002},\ntitle = {{The Materials Project: A materials genome approach to accelerating materials innovation}},\nurl = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\nvolume = {1},\nyear = {2013}\n}" ], "columns": { @@ -650,6 +653,7 @@ }, "matbench_log_gvrh": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. 
The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@Article{deJong2015,\nauthor={de Jong, Maarten and Chen, Wei and Angsten, Thomas\nand Jain, Anubhav and Notestine, Randy and Gamst, Anthony\nand Sluiter, Marcel and Krishna Ande, Chaitanya\nand van der Zwaag, Sybrand and Plata, Jose J. 
and Toher, Cormac\nand Curtarolo, Stefano and Ceder, Gerbrand and Persson, Kristin A.\nand Asta, Mark},\ntitle={Charting the complete elastic properties\nof inorganic crystalline compounds},\njournal={Scientific Data},\nyear={2015},\nmonth={Mar},\nday={17},\npublisher={The Author(s)},\nvolume={2},\npages={150009},\nnote={Data Descriptor},\nurl={http://dx.doi.org/10.1038/sdata.2015.9}\n}" ], "columns": { @@ -665,6 +669,7 @@ }, "matbench_log_kvrh": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. 
We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@Article{deJong2015,\nauthor={de Jong, Maarten and Chen, Wei and Angsten, Thomas\nand Jain, Anubhav and Notestine, Randy and Gamst, Anthony\nand Sluiter, Marcel and Krishna Ande, Chaitanya\nand van der Zwaag, Sybrand and Plata, Jose J. and Toher, Cormac\nand Curtarolo, Stefano and Ceder, Gerbrand and Persson, Kristin A.\nand Asta, Mark},\ntitle={Charting the complete elastic properties\nof inorganic crystalline compounds},\njournal={Scientific Data},\nyear={2015},\nmonth={Mar},\nday={17},\npublisher={The Author(s)},\nvolume={2},\npages={150009},\nnote={Data Descriptor},\nurl={http://dx.doi.org/10.1038/sdata.2015.9}\n}" ], "columns": { @@ -680,6 +685,7 @@ }, "matbench_dielectric": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. 
Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@article{Jain2013,\nauthor = {Jain, Anubhav and Ong, Shyue Ping and Hautier, Geoffroy and Chen, Wei and Richards, William Davidson and Dacek, Stephen and Cholia, Shreyas and Gunter, Dan and Skinner, David and Ceder, Gerbrand and Persson, Kristin a.},\ndoi = {10.1063/1.4812323},\nissn = {2166532X},\njournal = {APL Materials},\nnumber = {1},\npages = {011002},\ntitle = {{The Materials Project: A materials genome approach to accelerating materials innovation}},\nurl = {http://link.aip.org/link/AMPADS/v1/i1/p011002/s1\\&Agg=doi},\nvolume = {1},\nyear = {2013}\n}", "@article{Petousis2017,\nauthor={Petousis, Ioannis and Mrdjenovich, David and Ballouz, Eric\nand Liu, Miao and Winston, Donald and Chen, Wei and Graf, Tanja\nand Schladt, Thomas D. and Persson, Kristin A. 
and Prinz, Fritz B.},\ntitle={High-throughput screening of inorganic compounds for the\ndiscovery of novel dielectric and optical materials},\njournal={Scientific Data},\nyear={2017},\nmonth={Jan},\nday={31},\npublisher={The Author(s)},\nvolume={4},\npages={160134},\nnote={Data Descriptor},\nurl={http://dx.doi.org/10.1038/sdata.2016.134}\n}" ], @@ -696,6 +702,7 @@ }, "matbench_jdft2d": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. 
We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@Article{Choudhary2017,\nauthor={Choudhary, Kamal\nand Kalish, Irina\nand Beams, Ryan\nand Tavazza, Francesca},\ntitle={High-throughput Identification and Characterization of Two-dimensional Materials using Density functional theory},\njournal={Scientific Reports},\nyear={2017},\nvolume={7},\nnumber={1},\npages={5179},\nabstract={We introduce a simple criterion to identify two-dimensional (2D) materials based on the comparison between experimental lattice constants and lattice constants mainly obtained from Materials-Project (MP) density functional theory (DFT) calculation repository. Specifically, if the relative difference between the two lattice constants for a specific material is greater than or equal to 5%, we predict them to be good candidates for 2D materials. We have predicted at least 1356 such 2D materials. For all the systems satisfying our criterion, we manually create single layer systems and calculate their energetics, structural, electronic, and elastic properties for both the bulk and the single layer cases. Currently the database consists of 1012 bulk and 430 single layer materials, of which 371 systems are common to bulk and single layer. The rest of calculations are underway. To validate our criterion, we calculated the exfoliation energy of the suggested layered materials, and we found that in 88.9% of the cases the currently accepted criterion for exfoliation was satisfied. 
Also, using molybdenum telluride as a test case, we performed X-ray diffraction and Raman scattering experiments to benchmark our calculations and understand their applicability and limitations. The data is publicly available at the website http://www.ctcms.nist.gov/{\\textasciitilde}knc6/JVASP.html.},\nissn={2045-2322},\ndoi={10.1038/s41598-017-05402-0},\nurl={https://doi.org/10.1038/s41598-017-05402-0}\n}", "@misc{choudhary__2018, title={jdft_2d-7-7-2018.json}, url={https://figshare.com/articles/jdft_2d-7-7-2018_json/6815705/1}, DOI={10.6084/m9.figshare.6815705.v1}, abstractNote={2D materials}, publisher={figshare}, author={choudhary, kamal and https://orcid.org/0000-0001-9737-8074}, year={2018}, month={Jul}}" ], @@ -712,6 +719,7 @@ }, "matbench_perovskites": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. 
We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@Article{C2EE22341D,\nauthor =\"Castelli, Ivano E. and Landis, David D. and Thygesen, Kristian S. and Dahl, S\u00f8ren and Chorkendorff, Ib and Jaramillo, Thomas F. and Jacobsen, Karsten W.\",\ntitle =\"New cubic perovskites for one- and two-photon water splitting using the computational materials repository\",\njournal =\"Energy Environ. Sci.\",\nyear =\"2012\",\nvolume =\"5\",\nissue =\"10\",\npages =\"9034-9043\",\npublisher =\"The Royal Society of Chemistry\",\ndoi =\"10.1039/C2EE22341D\",\nurl =\"http://dx.doi.org/10.1039/C2EE22341D\",\nabstract =\"A new efficient photoelectrochemical cell (PEC) is one of the possible solutions to the energy and climate problems of our time. Such a device requires development of new semiconducting materials with tailored properties with respect to stability and light absorption. Here we perform computational screening of around 19\u2009000 oxides{,} oxynitrides{,} oxysulfides{,} oxyfluorides{,} and oxyfluoronitrides in the cubic perovskite structure with PEC applications in mind. We address three main applications: light absorbers for one- and two-photon water splitting and high-stability transparent shields to protect against corrosion. 
We end up with 20{,} 12{,} and 15 different combinations of oxides{,} oxynitrides and oxyfluorides{,} respectively{,} inviting further experimental investigation.\"}" ], "columns": { @@ -727,6 +735,7 @@ }, "matbench_glass": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. 
We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@Misc{LandoltBornstein1997:sm_lbs_978-3-540-47679-5_2,\nauthor=\"Kawazoe, Y.\nand Masumoto, T.\nand Tsai, A.-P.\nand Yu, J.-Z.\nand Aihara Jr., T.\",\neditor=\"Kawazoe, Y.\nand Yu, J.-Z.\nand Tsai, A.-P.\nand Masumoto, T.\",\ntitle=\"Nonequilibrium Phase Diagrams of Ternary Amorphous Alloys {\\textperiodcentered} 1 Introduction: Datasheet from Landolt-B{\\\"o}rnstein - Group III Condensed Matter {\\textperiodcentered} Volume 37A: ``Nonequilibrium Phase Diagrams of Ternary Amorphous Alloys'' in SpringerMaterials (https://dx.doi.org/10.1007/10510374{\\_}2)\",\npublisher=\"Springer-Verlag Berlin Heidelberg\",\nnote=\"Copyright 1997 Springer-Verlag Berlin Heidelberg\",\nnote=\"Part of SpringerMaterials\",\nnote=\"accessed 2018-10-23\",\ndoi=\"10.1007/10510374_2\",\nurl=\"https://materials.springer.com/lb/docs/sm_lbs_978-3-540-47679-5_2\"\n}", "@Article{Ward2016,\nauthor={Ward, Logan\nand Agrawal, Ankit\nand Choudhary, Alok\nand Wolverton, Christopher},\ntitle={A general-purpose machine learning framework for predicting properties of inorganic materials},\njournal={Npj Computational Materials},\nyear={2016},\nmonth={Aug},\nday={26},\npublisher={The Author(s)},\nvolume={2},\npages={16028},\nnote={Article},\nurl={http://dx.doi.org/10.1038/npjcompumats.2016.28}\n}" ], @@ -743,6 +752,7 @@ }, "matbench_expt_gap": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite 
and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. 
We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@article{doi:10.1021/acs.jpclett.8b00124,\nauthor = {Zhuo, Ya and Mansouri Tehrani, Aria and Brgoch, Jakoah},\ntitle = {Predicting the Band Gaps of Inorganic Solids by Machine Learning},\njournal = {The Journal of Physical Chemistry Letters},\nvolume = {9},\nnumber = {7},\npages = {1668-1673},\nyear = {2018},\ndoi = {10.1021/acs.jpclett.8b00124},\nnote ={PMID: 29532658},\neprint = {\nhttps://doi.org/10.1021/acs.jpclett.8b00124\n\n}}" ], "columns": { @@ -758,6 +768,7 @@ }, "matbench_expt_is_metal": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. 
We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@article{doi:10.1021/acs.jpclett.8b00124,\nauthor = {Zhuo, Ya and Mansouri Tehrani, Aria and Brgoch, Jakoah},\ntitle = {Predicting the Band Gaps of Inorganic Solids by Machine Learning},\njournal = {The Journal of Physical Chemistry Letters},\nvolume = {9},\nnumber = {7},\npages = {1668-1673},\nyear = {2018},\ndoi = {10.1021/acs.jpclett.8b00124},\nnote ={PMID: 29532658},\neprint = {\nhttps://doi.org/10.1021/acs.jpclett.8b00124\n\n}}" ], "columns": { @@ -773,6 +784,7 @@ }, "matbench_phonons": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. 
The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@Article{Petretto2018,\nauthor={Petretto, Guido\nand Dwaraknath, Shyam\nand P.C. 
Miranda, Henrique\nand Winston, Donald\nand Giantomassi, Matteo\nand van Setten, Michiel J.\nand Gonze, Xavier\nand Persson, Kristin A.\nand Hautier, Geoffroy\nand Rignanese, Gian-Marco},\ntitle={High-throughput density-functional perturbation theory phonons for inorganic materials},\njournal={Scientific Data},\nyear={2018},\nmonth={May},\nday={01},\npublisher={The Author(s)},\nvolume={5},\npages={180065},\nnote={Data Descriptor},\nurl={http://dx.doi.org/10.1038/sdata.2018.65}\n}", "@misc{petretto_dwaraknath_miranda_winston_giantomassi_rignanese_van setten_gonze_persson_hautier_2018, title={High-throughput Density-Functional Perturbation Theory phonons for inorganic materials}, url={https://figshare.com/collections/High-throughput_Density-Functional_Perturbation_Theory_phonons_for_inorganic_materials/3938023/1}, DOI={10.6084/m9.figshare.c.3938023.v1}, abstractNote={The knowledge of the vibrational properties of a material is of key importance to understand physical phenomena such as thermal conductivity, superconductivity, and ferroelectricity among others. However, detailed experimental phonon spectra are available only for a limited number of materials which hinders the large-scale analysis of vibrational properties and their derived quantities. In this work, we perform ab initio calculations of the full phonon dispersion and vibrational density of states for 1521 semiconductor compounds in the harmonic approximation based on density functional perturbation theory. The data is collected along with derived dielectric and thermodynamic properties. We present the procedure used to obtain the results, the details of the provided database and a validation based on the comparison with experimental data.}, publisher={figshare}, author={Petretto, Guido and Dwaraknath, Shyam and Miranda, Henrique P. C. and Winston, Donald and Giantomassi, Matteo and Rignanese, Gian-Marco and Van Setten, Michiel J. 
and Gonze, Xavier and Persson, Kristin A and Hautier, Geoffroy}, year={2018}, month={Apr}}" ], @@ -789,6 +801,7 @@ }, "matbench_steels": { "bibtex_refs": [ + "@Article{Dunn2020,\nauthor={Dunn, Alexander\nand Wang, Qi\nand Ganose, Alex\nand Dopp, Daniel\nand Jain, Anubhav},\ntitle={Benchmarking materials property prediction methods: the Matbench test set and Automatminer reference algorithm},\njournal={npj Computational Materials},\nyear={2020},\nmonth={Sep},\nday={15},\nvolume={6},\nnumber={1},\npages={138},\nabstract={We present a benchmark test suite and an automated machine learning procedure for evaluating supervised machine learning (ML) models for predicting properties of inorganic bulk materials. The test suite, Matbench, is a set of 13{\\thinspace}ML tasks that range in size from 312 to 132k samples and contain data from 10 density functional theory-derived and experimental sources. Tasks include predicting optical, thermal, electronic, thermodynamic, tensile, and elastic properties given a material's composition and/or crystal structure. The reference algorithm, Automatminer, is a highly-extensible, fully automated ML pipeline for predicting materials properties from materials primitives (such as composition and crystal structure) without user intervention or hyperparameter tuning. We test Automatminer on the Matbench test suite and compare its predictive power with state-of-the-art crystal graph neural networks and a traditional descriptor-based Random Forest model. We find Automatminer achieves the best performance on 8 of 13 tasks in the benchmark. We also show our test suite is capable of exposing predictive advantages of each algorithm---namely, that crystal graph methods appear to outperform traditional machine learning methods given {\\textasciitilde}104 or greater data points. 
We encourage evaluating materials ML algorithms on the Matbench benchmark and comparing them against the latest version of Automatminer.},\nissn={2057-3960},\ndoi={10.1038/s41524-020-00406-3},\nurl={https://doi.org/10.1038/s41524-020-00406-3}\n}\n", "@misc{Citrine Informatics,\ntitle = {Mechanical properties of some steels},\nhowpublished = {\\url{https://citrination.com/datasets/153092/},\n}" ], "columns": { diff --git a/matminer/featurizers/composition.py b/matminer/featurizers/composition.py index 0aa95c04a..b7e7afdc3 100644 --- a/matminer/featurizers/composition.py +++ b/matminer/featurizers/composition.py @@ -16,6 +16,7 @@ from matminer.featurizers.base import BaseFeaturizer from matminer.featurizers.utils.stats import PropertyStats +from matminer.featurizers.utils.oxidation import has_oxidation_states from matminer.utils.data import DemlData, MagpieData, PymatgenData, \ CohesiveEnergyData, MixingEnthalpy, MatscholarElementData, MEGNetElementData @@ -25,24 +26,6 @@ module_dir = os.path.dirname(os.path.abspath(__file__)) data_dir = os.path.join(module_dir, "..", "utils", "data_files") - -# Utility operations -def has_oxidation_states(comp): - """Check if a composition object has oxidation states for each element - - TODO: Does this make sense to add to pymatgen? -wardlt - - Args: - comp (Composition): Composition to check - Returns: - (boolean) Whether this composition object contains oxidation states - """ - for el in comp.elements: - if not hasattr(el, "oxi_state") or el.oxi_state is None: - return False - return True - - def is_ionic(comp): """Determines whether a compound is an ionic compound. @@ -1186,6 +1169,7 @@ def citations(self): "url = {http://linkinghub.elsevier.com/retrieve/pii/S0927025614007113}, " "volume = {97}, year = {2015} } "] + class Miedema(BaseFeaturizer): """ Formation enthalpies of intermetallic compounds, from Miedema et al. 
diff --git a/matminer/featurizers/site.py b/matminer/featurizers/site.py index d2e0f570e..4f459f085 100644 --- a/matminer/featurizers/site.py +++ b/matminer/featurizers/site.py @@ -2488,6 +2488,27 @@ def __init__( self.soap = None self.n_elements = None + + @classmethod + def from_preset(cls, preset): + """ + Create a SOAP featurizer object from sensible or published presets. + + Args: + preset (str): Choose from: + "formation energy": Preset used for formation energy prediction + in the original Dscribe paper. + + Returns: + + """ + valid_presets = ["formation_energy"] + if preset == "formation_energy": + return cls(6, 8, 8, 0.4, True, "gto", True) + else: + raise ValueError(f"'{preset}' is not a valid preset. Choose from {valid_presets}") + + def _check_fitted(self): if not self.soap: raise NotFittedError("Please fit SOAP before featurizing.") diff --git a/matminer/featurizers/structure.py b/matminer/featurizers/structure.py index 2064b9382..e23004af7 100644 --- a/matminer/featurizers/structure.py +++ b/matminer/featurizers/structure.py @@ -33,10 +33,11 @@ from matminer.featurizers.base import BaseFeaturizer from matminer.featurizers.site import OPSiteFingerprint, \ CoordinationNumber, LocalPropertyDifference, CrystalNNFingerprint, \ - AverageBondAngle, AverageBondLength + AverageBondAngle, AverageBondLength, SOAP from matminer.featurizers.utils.stats import PropertyStats from matminer.featurizers.utils.cgcnn import appropriate_kwargs, \ CrystalGraphConvNetWrapper, CIFDataWrapper +from matminer.featurizers.utils.oxidation import has_oxidation_states from matminer.utils.caching import get_all_nearest_neighbors from matminer.utils.data import IUCrBondValenceData @@ -133,21 +134,30 @@ class GlobalSymmetryFeatures(BaseFeaturizer): - Spacegroup number - Crystal system (1 of 7) - Centrosymmetry (has inversion symmetry) + - Number of symmetry ops, obtained from the spacegroup """ - crystal_idx = {"triclinic": 7, - "monoclinic": 6, - "orthorhombic": 5, - "tetragonal": 4, 
- "trigonal": 3, - "hexagonal": 2, - "cubic": 1 - } + crystal_idx = { + "triclinic": 7, + "monoclinic": 6, + "orthorhombic": 5, + "tetragonal": 4, + "trigonal": 3, + "hexagonal": 2, + "cubic": 1 + } + + all_features = [ + "spacegroup_num", + "crystal_system", + "crystal_system_int", + "is_centrosymmetric", + "n_symmetry_ops" + ] def __init__(self, desired_features=None): - self.features = ["spacegroup_num", "crystal_system", - "crystal_system_int", "is_centrosymmetric"] if not \ - desired_features else desired_features + self.features = \ + desired_features if desired_features else self.all_features def featurize(self, s): sga = SpacegroupAnalyzer(s) @@ -166,19 +176,19 @@ def featurize(self, s): if "is_centrosymmetric" in self.features: output.append(sga.is_laue()) + if "n_symmetry_ops" in self.features: + output.append(len(sga.get_symmetry_operations())) + return output def feature_labels(self): - all_features = ["spacegroup_num", "crystal_system", - "crystal_system_int", - "is_centrosymmetric"] # enforce order - return [x for x in all_features if x in self.features] + return [x for x in self.all_features if x in self.features] def citations(self): return [] def implementors(self): - return ["Anubhav Jain"] + return ["Anubhav Jain", "Alex Dunn"] class Dimensionality(BaseFeaturizer): @@ -221,16 +231,35 @@ class RadialDistributionFunction(BaseFeaturizer): Calculate the radial distribution function (RDF) of a crystal structure. Features: - - Radial distribution function + - Radial distribution function. Each feature is the "density" of the + distribution at a certain radius. Args: - cutoff: (float) distance up to which to calculate the RDF. - bin_size: (float) size of each bin of the (discrete) RDF. + cutoff: (float) Angstrom distance up to which to calculate the RDF. + bin_size: (float) size in Angstrom of each bin of the (discrete) RDF. + + Attributes: + bin_distances (np.Ndarray): The distances each bin represents. Can be + used for graphing the RDF. 
""" def __init__(self, cutoff=20.0, bin_size=0.1): self.cutoff = cutoff self.bin_size = bin_size + self.bin_distances = np.arange(0, cutoff, bin_size) + + def precheck(self, s): + """ + Precheck the structure is ordered. + + Args: + s: (pymatgen.Struture) + + Returns: + (bool): True if passing precheck, false if failing + + """ + return s.is_ordered def featurize(self, s): """ @@ -239,7 +268,7 @@ def featurize(self, s): s (Structure): Pymatgen Structure object. Returns: - rdf, dist: (tuple of arrays) the first element is the + rdf: (iterable) the first element is the normalized RDF, whereas the second element is the inner radius of the RDF bin. """ @@ -261,16 +290,18 @@ def featurize(self, s): dist_bins[1:], 3) - np.power(dist_bins[:-1], 3)) number_density = s.num_sites / s.volume rdf = dist_hist / shell_vol / number_density - return [{'distances': dist_bins[:-1], 'distribution': rdf}] + return rdf def feature_labels(self): - return ["radial distribution function"] + bin_labels = get_rdf_bin_labels(self.bin_distances, self.cutoff) + bin_labels = [f"rdf {bl}A" for bl in bin_labels] + return bin_labels def citations(self): return [] def implementors(self): - return ["Saurabh Bajaj"] + return ["Saurabh Bajaj", "Alex Dunn"] class PartialRadialDistributionFunction(BaseFeaturizer): @@ -282,19 +313,21 @@ class PartialRadialDistributionFunction(BaseFeaturizer): descriptor by [Schutt *et al.*] (https://journals.aps.org/prb/abstract/10.1103/PhysRevB.89.205118) + Features: + Each feature corresponds to the density of number of bonds + for a certain pair of elements at a certain range of + distances. For example, "Al-Al PRDF r=1.00-1.50" corresponds + to the density of Al-Al bonds between 1 and 1.5 distance units + By default, this featurizer generates RDFs for each pair + of elements in the training set. + Args: cutoff: (float) distance up to which to calculate the RDF. bin_size: (float) size of each bin of the (discrete) RDF. 
include_elems: (list of string), list of elements that must be included in PRDF exclude_elems: (list of string), list of elmeents that should not be included in PRDF - Features: - Each feature corresponds to the density of number of bonds - for a certain pair of elements at a certain range of - distances. For example, "Al-Al PRDF r=1.00-1.50" corresponds - to the density of Al-Al bonds between 1 and 1.5 distance units - By default, this featurizer generates RDFs for each pair - of elements in the training set.""" + """ def __init__(self, cutoff=20.0, bin_size=0.1, include_elems=(), exclude_elems=()): @@ -305,6 +338,19 @@ def __init__(self, cutoff=20.0, bin_size=0.1, include_elems=(), include_elems) # Makes sure the element lists are ordered self.exclude_elems = list(exclude_elems) + def precheck(self, s): + """ + Precheck the structure is ordered. + + Args: + s: (pymatgen.Struture) + + Returns: + (bool): True if passing precheck, false if failing + + """ + return s.is_ordered + def fit(self, X, y=None): """Define the list of elements to be included in the PRDF. By default, the PRDF will include all of the elements in `X` @@ -352,8 +398,7 @@ def featurize(self, s): if self.elements_ is None: raise Exception("You must run 'fit' first!") - dist_bins, prdf = self.compute_prdf( - s) # Assemble the PRDF for each pair + dist_bins, prdf = self.compute_prdf(s) # Assemble the PRDF for each pair # Convert the PRDF into a feature array zeros = np.zeros_like(dist_bins) # Zeros if elements don't appear @@ -466,16 +511,35 @@ class ElectronicRadialDistributionFunction(BaseFeaturizer): from atomic partial charges. Atomic charges are obtained from the ValenceIonicRadiusEvaluator class. + WARNING: The ReDF needs oxidation states to work correctly. + Args: cutoff: (float) distance up to which the ReDF is to be - calculated (default: longest diagaonal in - primitive cell). + calculated dr: (float) width of bins ("x"-axis) of ReDF (default: 0.05 A). 
+ + Attributes: + distances (np.ndarray): The distances at which each bin begins. """ - def __init__(self, cutoff=None, dr=0.05): + def __init__(self, cutoff=20, dr=0.05): self.cutoff = cutoff self.dr = dr + self.nbins = int(self.cutoff / self.dr) + 1 + self.distances = np.array([i * self.dr for i in range(self.nbins)]) + + def precheck(self, s) -> bool: + """ + Check the structure to ensure the ReDF can be run. + + Args: + s (pymatgen. Structure): Structure to precheck + + Returns: + (bool) + + """ + return has_oxidation_states(s.composition) and s.is_ordered def featurize(self, s): """ @@ -490,6 +554,11 @@ def featurize(self, s): 'distances'; the ReDF itself is accessible via key 'redf'. """ + + if not has_oxidation_states(s.composition): + raise ValueError("Structure must have oxidation states") + if not s.is_ordered: + raise ValueError("Structure must be ordered") if self.dr <= 0: raise ValueError("width of bins for ReDF must be >0") @@ -498,20 +567,7 @@ def featurize(self, s): # Add oxidation states. struct = ValenceIonicRadiusEvaluator(struct).structure - - if self.cutoff is None: - # Set cutoff to longest diagonal. 
- a = struct.lattice.matrix[0] - b = struct.lattice.matrix[1] - c = struct.lattice.matrix[2] - self.cutoff = max( - [np.linalg.norm(a + b + c), np.linalg.norm(-a + b + c), - np.linalg.norm(a - b + c), np.linalg.norm(a + b - c)]) - - nbins = int(self.cutoff / self.dr) + 1 - redf_dict = {"distances": np.array( - [(i + 0.5) * self.dr for i in range(nbins)]), - "distribution": np.zeros(nbins, dtype=np.float)} + distribution = np.zeros(self.nbins, dtype=np.float) for site in struct.sites: this_charge = float(site.specie.oxi_state) @@ -519,13 +575,15 @@ def featurize(self, s): for nnsite, dist, *_ in neighbors: neigh_charge = float(nnsite.specie.oxi_state) bin_index = int(dist / self.dr) - redf_dict["distribution"][bin_index] \ + distribution[bin_index] \ += (this_charge * neigh_charge) / (struct.num_sites * dist) - return [redf_dict] + return distribution def feature_labels(self): - return ["electronic radial distribution function"] + bin_labels = get_rdf_bin_labels(self.distances, self.cutoff) + bin_labels = [f"ReDF {bl}A" for bl in bin_labels] + return bin_labels def citations(self): return ["@article{title={Method for the computational comparison" @@ -1015,21 +1073,84 @@ class MinimumRelativeDistances(BaseFeaturizer): """ Determines the relative distance of each site to its closest neighbor. + We use the relative distance, f_ij = r_ij / (r^atom_i + r^atom_j), as a measure rather than the absolute distances, r_ij, to account for the fact that different atoms/species have different sizes. The function uses the valence-ionic radius estimator implemented in Pymatgen. + + The features can be flattened so a uniform-length vector is returned for + each material, regardless of the number of sites in each structure. + Returning flat output REQUIRES fitting (using self.fit(...)). If fit, + structures having fewer sites than the max sites among the fitting + structures are extended with NaNs; structures with more sites are truncated. 
+ + To return non-flat (i.e., requiring further processing) features so that + no features are NaN and no distances are truncated, use flatten=False. + + Features: + + If using flatten=True: + + site #{number} min. rel. dist. (float): The minimum relative distance of + site {number} + site #{number} specie (str): The string representing the specie at site + {number} + site #{number} neighbor specie(s) (str, tuple(str)): The neighbor specie + used to determine the minimum relative distance with respect to site + {number}. If multiple neighbor sites have equivalent minimum + relative distances,all these sites are listed in a tuple. + + + If using flatten=False: + + minimum relative distance of each site ([float]): List of the minimum + relative distance for each site. Structures with different numbers + of sites will return a different length vector. + Args: - cutoff: (float) (absolute) distance up to which tentative - closest neighbors (on the basis of relative distances) - are to be determined. + cutoff (float): (absolute) distance up to which tentative closest + neighbors (on the basis of relative distances) are to be determined. + flatten (bool): If True, returns a uniform length feature vector for + each structure regardless of the number of sites in the structure. + If True, you must call .fit() before featurizing. + include_distances (bool): Include the numerical minimum relative + distance in the returned features. Only used if flatten=True. + include_species (bool): Include the species for each site and the + species of the neighbor (as determined by minimum rel. distance). + Only used as flatten=True. """ - def __init__(self, cutoff=10.0): + def __init__(self, cutoff=10.0, flatten=True, include_distances=True, + include_species=True): + + if not include_distances and not include_species: + raise ValueError( + "Featurizer must return distances, species, or both." 
+ ) + + self.include_distances = include_distances + self.include_species = include_species self.cutoff = cutoff + self.flatten = flatten + self._max_sites = None + + def fit(self, X, y=None): + """ + Fit the MRD featurizer to a list of structures. + + Args: + X ([Structure]): A list of pymatgen structures. + y : unused (added for consistency with overridden method signature) - def featurize(self, s, cutoff=10.0): + Returns: + self + """ + self._max_sites = max([len(s.sites) for s in X]) + return self + + def featurize(self, s): """ Get minimum relative distances of all sites of the input structure. @@ -1040,21 +1161,80 @@ def featurize(self, s, cutoff=10.0): dists_relative_min: (list of floats) list of all minimum relative distances (i.e., for all sites). """ + + + self._check_fitted() + vire = ValenceIonicRadiusEvaluator(s) - dists_relative_min = [] - for site in vire.structure: + n_sites = len(s.sites) + parent_site_species = [None] * n_sites + neighbor_site_species = [None] * n_sites + dists_relative_min = [None] * n_sites + + for i, site in enumerate(vire.structure): dists_relative = [] - for nnsite, dist, *_ in vire.structure.get_neighbors(site, self.cutoff): + neigh_species_relative = [] + for nnsite, dist, *_ in vire.structure.get_neighbors( + site, self.cutoff + ): r_site = vire.radii[site.species_string] r_neigh = vire.radii[nnsite.species_string] radii_dist = r_site + r_neigh d_relative = dist / radii_dist dists_relative.append(d_relative) - dists_relative_min.append(min(dists_relative)) - return [dists_relative_min] + neigh_species_relative.append(site.species_string) + + dists_relative = np.asarray(dists_relative) + drmin = dists_relative.min() + dists_relative_min[i] = drmin + dists_relative_min_ix = np.where(dists_relative == drmin) + + neigh_species_equiv = \ + np.asarray(neigh_species_relative)[dists_relative_min_ix] + + parent_site_species[i] = site.species_string + + if len(neigh_species_equiv) == 1: + neighbor_site_species[i] = 
neigh_species_equiv[0] + else: + neighbor_site_species[i] = tuple(neigh_species_equiv) + + if self.flatten: + features = [] + + for i in range(self._max_sites): + site_features = [] + if i <= n_sites - 1: + if self.include_distances: + site_features.append(dists_relative_min[i]) + if self.include_species: + site_features.append(parent_site_species[i]) + site_features.append(neighbor_site_species[i]) + else: + site_features = [np.nan] * (int(self.include_distances) + + 2 * int(self.include_species)) + features += site_features + return features + + else: + return [dists_relative_min] def feature_labels(self): - return ["minimum relative distance of each site"] + self._check_fitted() + + if self.flatten: + labels = [] + for i in range(self._max_sites): + site_labels = [] + if self.include_distances: + site_labels.append(f"site #{i} min. rel. dist.") + if self.include_species: + site_labels.append(f"site #{i} specie") + site_labels.append(f"site #{i} neighbor specie(s)") + labels += site_labels + return labels + else: + return ["minimum relative distance of each site"] def citations(self): return ["@article{Zimmermann2017," @@ -1074,6 +1254,13 @@ def citations(self): def implementors(self): return ["Nils E. R. Zimmermann", "Alex Dunn"] + def _check_fitted(self): + if not self._max_sites and self.flatten: + raise NotFittedError( + "If using flatten=True, MinimumRelativeDistances must be fit " + "before using." + ) + class SiteStatsFingerprint(BaseFeaturizer): """ @@ -1122,6 +1309,27 @@ def __init__(self, site_featurizer, stats=('mean', 'std_dev'), min_oxi=None, def _site_labels(self): return self.site_featurizer.feature_labels() + def fit(self, X, y=None, **fit_kwargs): + """ + Fit the SiteStatsFeaturizer using the fitting function of the underlying + site featurizer. Only applicable if the site featurizer is fittable. + + See the ".fit()" method of the site_featurizer used to construct the + class for more information. 
+ + Args: + X (Iterable): + y (optional, Iterable): + **fit_kwargs: Keyword arguments used by the fit function of the + site featurizer class. + + Returns: + self (SiteStatsFeaturizer) + + """ + self.site_featurizer.fit(X, y, **fit_kwargs) + return self + def featurize(self, s): # Get each feature for each site vals = [[] for t in self._site_labels] @@ -1191,8 +1399,12 @@ def from_preset(preset, **kwargs): preset (str) - Name of preset kwargs - Options for SiteStatsFingerprint """ - - if preset == "CrystalNNFingerprint_cn": + if preset == "SOAP_formation_energy": + return SiteStatsFingerprint( + SOAP.from_preset("formation_energy"), + **kwargs + ) + elif preset == "CrystalNNFingerprint_cn": return SiteStatsFingerprint( CrystalNNFingerprint.from_preset("cn", cation_anion=False), **kwargs) @@ -3828,3 +4040,26 @@ def citations(self): "doi = {10.1180/minmag.2013.077.3.05}," "url = {https://doi.org/10.1180/minmag.2013.077.3.05}}", ] + + +def get_rdf_bin_labels(bin_distances, cutoff): + """ + Common function for getting bin labels given the distances at which each + bin begins and the ending cutoff. + + + Args: + bin_distances (np.ndarray): The distances at which each bin begins. + cutoff (float): The final cutoff value. 
+ + Returns: + [str]: The feature labels for the *RDF + + """ + bin_dists_complete = np.concatenate((bin_distances, np.asarray([cutoff]))) + flabels = [""] * len(bin_distances) + for i, _ in enumerate(bin_distances): + lower = "{:.5f}".format(bin_dists_complete[i]) + higher = "{:.5f}".format(bin_dists_complete[i + 1]) + flabels[i] = f"[{lower} - {higher}]" + return flabels \ No newline at end of file diff --git a/matminer/featurizers/tests/test_structure.py b/matminer/featurizers/tests/test_structure.py index 0468798e6..35d8bd179 100644 --- a/matminer/featurizers/tests/test_structure.py +++ b/matminer/featurizers/tests/test_structure.py @@ -13,7 +13,7 @@ from multiprocessing import set_start_method from sklearn.exceptions import NotFittedError -from pymatgen import Structure, Lattice, Molecule +from pymatgen import Structure, Lattice, Molecule, Specie from pymatgen.util.testing import PymatgenTest from matminer.featurizers.composition import ElementProperty @@ -27,7 +27,7 @@ MaximumPackingEfficiency, ChemicalOrdering, StructureComposition, \ Dimensionality, XRDPowderPattern, CGCNNFeaturizer, JarvisCFID, \ GlobalInstabilityIndex, \ - StructuralComplexity + StructuralComplexity, get_rdf_bin_labels # For the CGCNNFeaturizer try: @@ -78,6 +78,15 @@ def setUp(self): coords_are_cartesian=False) self.bond_angles = range(5, 180, 5) + + diamond_copy = copy.deepcopy(self.diamond) + diamond_copy.replace_species( + {Specie("C", 0.0): + {Specie("C", 0.0): 0.99, Specie("Si", 0.0): 0.01} + } + ) + self.disordered_diamond = diamond_copy + def test_density_features(self): df = DensityFeatures() f = df.featurize(self.diamond) @@ -99,7 +108,8 @@ def test_density_features(self): def test_global_symmetry(self): gsf = GlobalSymmetryFeatures() - self.assertEqual(gsf.featurize(self.diamond), [227, "cubic", 1, True]) + self.assertEqual(gsf.featurize(self.diamond), + [227, "cubic", 1, True, 48]) def test_dimensionality(self): cscl = PymatgenTest.get_structure("CsCl") @@ -112,59 +122,62 @@ 
def test_dimensionality(self): def test_rdf_and_peaks(self): ## Test diamond - rdforig = RadialDistributionFunction().featurize( - self.diamond) - rdf = rdforig[0] + rdf = RadialDistributionFunction() + diamond_rdf = rdf.featurize(self.diamond) + + # Prechecking test + self.assertTrue(rdf.precheck(self.diamond)) + self.assertFalse(rdf.precheck(self.disordered_diamond)) # Make sure it the last bin is cutoff-bin_max - self.assertAlmostEqual(max(rdf['distances']), 19.9) + self.assertAlmostEqual(max(rdf.bin_distances), 19.9) # Verify bin sizes - self.assertEqual(len(rdf['distribution']), 200) + self.assertEqual(len(diamond_rdf), 200) # Make sure it gets all of the peaks - self.assertEqual(np.count_nonzero(rdf['distribution']), 116) + self.assertEqual(np.count_nonzero(diamond_rdf), 116) # Check the values for a few individual peaks - self.assertAlmostEqual( - rdf['distribution'][int(round(1.5 / 0.1))], 15.12755155) - self.assertAlmostEqual( - rdf['distribution'][int(round(2.9 / 0.1))], 12.53193948) - self.assertAlmostEqual( - rdf['distribution'][int(round(19.9 / 0.1))], 0.822126129) + self.assertAlmostEqual(diamond_rdf[int(round(1.5 / 0.1))], 15.12755155) + self.assertAlmostEqual(diamond_rdf[int(round(2.9 / 0.1))], 12.53193948) + self.assertAlmostEqual(diamond_rdf[int(round(19.9 / 0.1))], 0.822126129) + + # Check the feature labels make sense + self.assertEqual(rdf.feature_labels()[0], "rdf [0.00000 - 0.10000]A") + self.assertEqual(rdf.feature_labels()[9], "rdf [0.90000 - 1.00000]A") # Repeat test with NaCl (omitting comments). 
Altering cutoff distance - rdforig = RadialDistributionFunction(cutoff=10).featurize(self.nacl) - rdf = rdforig[0] - self.assertAlmostEqual(max(rdf['distances']), 9.9) - self.assertEqual(len(rdf['distribution']), 100) - self.assertEqual(np.count_nonzero(rdf['distribution']), 11) - self.assertAlmostEqual( - rdf['distribution'][int(round(2.8 / 0.1))], 27.09214168) - self.assertAlmostEqual( - rdf['distribution'][int(round(4.0 / 0.1))], 26.83338723) - self.assertAlmostEqual( - rdf['distribution'][int(round(9.8 / 0.1))], 3.024406467) + rdf2 = RadialDistributionFunction(cutoff=10) + nacl_rdf = rdf2.featurize(self.nacl) + self.assertAlmostEqual(max(rdf2.bin_distances), 9.9) + self.assertEqual(len(nacl_rdf), 100) + self.assertEqual(np.count_nonzero(nacl_rdf), 11) + self.assertAlmostEqual(nacl_rdf[int(round(2.8 / 0.1))], 27.09214168) + self.assertAlmostEqual(nacl_rdf[int(round(4.0 / 0.1))], 26.83338723) + self.assertAlmostEqual(nacl_rdf[int(round(9.8 / 0.1))], 3.024406467) # Repeat test with CsCl. 
Altering cutoff distance and bin_size - rdforig = RadialDistributionFunction( - cutoff=8, bin_size=0.5).featurize(self.cscl) - rdf = rdforig[0] - self.assertAlmostEqual(max(rdf['distances']), 7.5) - self.assertEqual(len(rdf['distribution']), 16) - self.assertEqual(np.count_nonzero(rdf['distribution']), 5) - self.assertAlmostEqual( - rdf['distribution'][int(round(3.5 / 0.5))], 6.741265585) - self.assertAlmostEqual( - rdf['distribution'][int(round(4.0 / 0.5))], 3.937582548) - self.assertAlmostEqual( - rdf['distribution'][int(round(7.0 / 0.5))], 1.805505363) + rdf3 = RadialDistributionFunction(cutoff=8, bin_size=0.5) + cscl_rdf = rdf3.featurize(self.cscl) + self.assertAlmostEqual(max(rdf3.bin_distances), 7.5) + self.assertEqual(len(cscl_rdf), 16) + self.assertEqual(np.count_nonzero(cscl_rdf), 5) + self.assertAlmostEqual(cscl_rdf[int(round(3.5 / 0.5))], 6.741265585) + self.assertAlmostEqual(cscl_rdf[int(round(4.0 / 0.5))], 3.937582548) + self.assertAlmostEqual(cscl_rdf[int(round(7.0 / 0.5))], 1.805505363) def test_prdf(self): # Test a few peaks in diamond # These expected numbers were derived by performing # the calculation in another code distances, prdf = PartialRadialDistributionFunction().compute_prdf(self.diamond) + + # Check prechecking + prdf_obj = PartialRadialDistributionFunction() + self.assertTrue(prdf_obj.precheck(self.diamond)) + self.assertFalse(prdf_obj.precheck(self.disordered_diamond)) + self.assertEqual(len(prdf.values()), 1) self.assertAlmostEqual(prdf[('C', 'C')][int(round(1.4 / 0.1))], 0) self.assertAlmostEqual(prdf[('C', 'C')][int(round(1.5 / 0.1))], 1.32445167622) @@ -237,30 +250,40 @@ def test_prdf(self): [prdf[('Al', 'Al')], prdf[('Al', 'Ni')], prdf[('Ni', 'Ni')]])) def test_redf(self): - d = ElectronicRadialDistributionFunction().featurize( - self.diamond)[0] - self.assertAlmostEqual(int(1000 * d["distances"][0]), 25) - self.assertAlmostEqual(int(1000 * d["distribution"][0]), 0) - self.assertAlmostEqual(int(1000 * d["distances"][len( - 
d["distances"]) - 1]), 6175) - self.assertAlmostEqual(int(1000 * d["distribution"][len( - d["distances"]) - 1]), 0) - d = ElectronicRadialDistributionFunction().featurize( - self.nacl)[0] - self.assertAlmostEqual(int(1000 * d["distances"][0]), 25) - self.assertAlmostEqual(int(1000 * d["distribution"][0]), 0) - self.assertAlmostEqual(int(1000 * d["distances"][56]), 2825) - self.assertAlmostEqual(int(1000 * d["distribution"][56]), -2108) - self.assertAlmostEqual(int(1000 * d["distances"][len( - d["distances"]) - 1]), 9875) - d = ElectronicRadialDistributionFunction().featurize( - self.cscl)[0] - self.assertAlmostEqual(int(1000 * d["distances"][0]), 25) - self.assertAlmostEqual(int(1000 * d["distribution"][0]), 0) - self.assertAlmostEqual(int(1000 * d["distances"][72]), 3625) - self.assertAlmostEqual(int(1000 * d["distribution"][72]), -2194) - self.assertAlmostEqual(int(1000 * d["distances"][len( - d["distances"]) - 1]), 7275) + + # Test prechecking + erdf = ElectronicRadialDistributionFunction(cutoff=10, dr=0.05) + self.assertTrue(erdf.precheck(self.diamond)) + self.assertFalse(erdf.precheck(self.disordered_diamond)) + self.assertFalse(erdf.precheck(self.diamond_no_oxi)) + + # C has oxi state of 0 in diamond, so we expect them all to be 0 + d = erdf.featurize(self.diamond) + self.assertAlmostEqual(erdf.distances[0], 0) + self.assertAlmostEqual(erdf.distances[1], 0.05) + self.assertFalse(np.asarray(d).any()) + + d = erdf.featurize(self.nacl) + self.assertAlmostEqual(erdf.distances[0], 0) + self.assertAlmostEqual(erdf.distances[1], 0.05) + self.assertTrue(np.asarray(d).any()) + self.assertAlmostEqual(d[-4], 0.81151636) + self.assertAlmostEqual(d[-13], -2.54280359) + self.assertAlmostEqual(d[56], -2.10838136) + + d = erdf.featurize(self.cscl) + self.assertAlmostEqual(erdf.distances[0], 0) + self.assertAlmostEqual(erdf.distances[1], 0.05) + self.assertAlmostEqual(d[72], -2.19472661) + self.assertAlmostEqual(d[-13], 2.55004188) + + def test_get_rdf_bin_labels(self): + 
bin_distances = [1, 2, 3, 4, 5] + cutoff = 6 + flabels = get_rdf_bin_labels(bin_distances, cutoff) + self.assertEqual(flabels[0], "[1.00000 - 2.00000]") + self.assertEqual(flabels[2], "[3.00000 - 4.00000]") + self.assertEqual(flabels[-1], "[5.00000 - 6.00000]") def test_coulomb_matrix(self): # flat @@ -368,13 +391,65 @@ def test_orbital_field_matrix(self): self.assertAlmostEqual(ofm_vector[ix], 1.4789015345821415) def test_min_relative_distances(self): - self.assertAlmostEqual(MinimumRelativeDistances().featurize( + + with self.assertRaises(ValueError): + MinimumRelativeDistances(include_species=False, + include_distances=False) + + mrd_nonuniform = MinimumRelativeDistances(flatten=False) + self.assertAlmostEqual(mrd_nonuniform.featurize( self.diamond_no_oxi)[0][0], 1.1052576) - self.assertAlmostEqual(MinimumRelativeDistances().featurize( + self.assertAlmostEqual(mrd_nonuniform.featurize( self.nacl)[0][0], 0.8891443) - self.assertAlmostEqual(MinimumRelativeDistances().featurize( + self.assertAlmostEqual(mrd_nonuniform.featurize( self.cscl)[0][0], 0.9877540) + mrd_flat = MinimumRelativeDistances(flatten=True) + + with self.assertRaises(NotFittedError): + mrd_flat.featurize(self.diamond) + + # Fit on a structure with 2 sites: + mrd_flat.fit([self.diamond_no_oxi]) + + # Ensure it can featurize the structure it was fit on + f_diamond = mrd_flat.featurize(self.diamond_no_oxi) + self.assertAlmostEqual(f_diamond[0], 1.1052576) + self.assertEqual(f_diamond[1], "C") + self.assertEqual(f_diamond[2], "C") + self.assertAlmostEqual(f_diamond[3], 1.1052576) + self.assertEqual(f_diamond[4], "C") + self.assertEqual(f_diamond[5], "C") + self.assertEqual(len(f_diamond), 6) + + # Ensure it can featurize a different structure w/ same n_sites (2) + f_cscl = mrd_flat.featurize(self.cscl) + self.assertAlmostEqual(f_cscl[0], 0.9877540) + self.assertEqual(f_cscl[1], "Cl-") + self.assertEqual(f_cscl[2][0], "Cl-") + self.assertEqual(len(f_cscl[2]), 4) + self.assertEqual(len(f_cscl), 6) + + + 
# Ensure it truncates extra sites on structure w/ more n_sites + f_ni3al = mrd_flat.featurize(self.ni3al) + self.assertAlmostEqual(f_ni3al[0], 0.95731379) + self.assertEqual(f_ni3al[1], "Al") + self.assertEqual(f_ni3al[2][0], "Al") + self.assertEqual(len(f_ni3al[2]), 12) + self.assertEqual(len(f_ni3al), 6) + self.assertAlmostEqual(f_ni3al[3], 0.921857729) + + # Ensure it extends extra sites on structure with fewer n_sites + f_sc = mrd_flat.featurize(self.sc) + self.assertAlmostEqual(f_sc[0], 1.408) + self.assertEqual(f_sc[1], "Al") + self.assertEqual(f_sc[2][0], "Al") + self.assertEqual(len(f_sc[2]), 6) + self.assertEqual(len(f_sc), 6) + self.assertTrue(f_sc[3], np.nan) + + def test_sitestatsfingerprint(self): # Test matrix. op_struct_fp = SiteStatsFingerprint.from_preset("OPSiteFingerprint", @@ -393,7 +468,7 @@ def test_sitestatsfingerprint(self): # Test stats. op_struct_fp = SiteStatsFingerprint.from_preset("OPSiteFingerprint") opvals = op_struct_fp.featurize(self.diamond) - print(opvals, '**') + # print(opvals, '**') self.assertAlmostEqual(opvals[0], 0.0005, places=7) self.assertAlmostEqual(opvals[1], 0, places=7) self.assertAlmostEqual(opvals[2], 0.0005, places=7) @@ -431,6 +506,15 @@ def test_sitestatsfingerprint(self): features = prop_fp.featurize(self.nacl) self.assertArrayAlmostEqual([14, 29.22138464, 37.38969216], features) + # Test soap site featurizer + soap_fp = SiteStatsFingerprint.from_preset("SOAP_formation_energy") + soap_fp.fit([self.sc, self.diamond, self.nacl]) + feats = soap_fp.featurize(self.diamond) + self.assertEqual(len(feats), 6480) + self.assertAlmostEqual(feats[0], 0.4412608, places=5) + self.assertAlmostEqual(feats[1], 0.0) + self.assertAlmostEqual(np.sum(feats), 207.88194724, places=5) + def test_ewald(self): # Add oxidation states to all of the structures for s in [self.nacl, self.cscl, self.diamond]: diff --git a/matminer/featurizers/utils/oxidation.py b/matminer/featurizers/utils/oxidation.py new file mode 100644 index 
000000000..5702c8cab --- /dev/null +++ b/matminer/featurizers/utils/oxidation.py @@ -0,0 +1,15 @@ +# Utility operations +def has_oxidation_states(comp): + """Check if a composition object has oxidation states for each element + + TODO: Does this make sense to add to pymatgen? -wardlt + + Args: + comp (Composition): Composition to check + Returns: + (boolean) Whether this composition object contains oxidation states + """ + for el in comp.elements: + if not hasattr(el, "oxi_state") or el.oxi_state is None: + return False + return True \ No newline at end of file