Python脚本:将mol2分子库文件拆分为单个mol2文件

如题:将mol2分子库文件拆分为单个mol2文件

用法:

python split_multimol2.py multi-mol2.mol2 out_dir
参数说明:split_multimol2.py 为脚本文件,multi-mol2.mol2 为待拆分的 mol2 分子库文件,out_dir 为输出目录;可选参数 -c/--chunksize 用于指定每个输出文件包含的分子数。

split_multimol2.py:

#Python2 or Python3
#AspirinCode 2018
#Script that splits a multi-mol2 file into individual mol2 files.
#python split_multimol2.py multi-mol2.mol2 out_dir

import sys
import os


def _build_mol2_record(lines):
    """
    Packs the raw lines of one molecule into an [ID, contents] record.

    The molecule ID is the stripped second line (the line following the
    '@<TRIPOS>MOLECULE' header); it is an empty string when the record
    consists of the header line only.

    """
    molecule_id = lines[1].strip() if len(lines) > 1 else ''
    # Only the final line is right-stripped, so the record does not end
    # with a newline; callers append their own.
    lines[-1] = lines[-1].rstrip()
    return [molecule_id, "".join(lines)]


def split_multimol2(multimol2):
    """
    Splits a multi-mol2 file into its individual molecule records.

    A record starts at a line beginning with '@<TRIPOS>MOLECULE' and runs
    up to (but not including) the next such line, or to the end of the file.
    Lines that precede the first record header (e.g. comments or blank
    lines) are skipped.

    Parameters
    ----------
    multimol2 : str
      Path to the multi-mol2 file.

    Returns
    ----------
    A generator object for lists for every extracted mol2-file. Lists contain
      the molecule ID and the mol2 file contents.
      e.g., ['ID1234', '@<TRIPOS>MOLECULE...']

    """
    record_header = "@<TRIPOS>MOLECULE"

    with open(multimol2, 'r') as mol2file:
        mol2cont = None  # lines of the record being collected, if any
        # Iterating the file handles end-of-file on its own, so no
        # tell()/fstat() bookkeeping is needed and every line is consumed
        # exactly once (no risk of spinning on a non-header line).
        for line in mol2file:
            if line.startswith(record_header):
                if mol2cont is not None:
                    yield _build_mol2_record(mol2cont)
                mol2cont = [line]
            elif mol2cont is not None:
                mol2cont.append(line)
        if mol2cont is not None:
            yield _build_mol2_record(mol2cont)


def write_multimol2(multimol2, out_dir):
    """
    Splits a multi-mol2 file into individual mol2 files, one per molecule.

    Parameters
    -----------
    multimol2 : str
      Path to the multi-mol2 file.

    out_dir : str:
      Output directory (created if it does not exist). New files will be
      named <molecule_name_1>.mol2, ... <molecule_name_n>.mol2

    Returns
    -----------
    written : int
      Number of molecules written. Each molecule is written to the file
      named after its ID, so molecules sharing an ID overwrite each other
      and the number of distinct files can be smaller.

    """
    # Test the filesystem, not the truthiness of the path string.
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    written = 0
    # Use the function's own arguments rather than the command-line globals,
    # so the function also works when imported from another module.
    for molecule_id, contents in split_multimol2(multimol2):
        out_mol2 = os.path.join(out_dir, molecule_id) + '.mol2'
        with open(out_mol2, 'w') as out_file:
            out_file.write(contents)
            out_file.write('\n')
        written += 1
    return written


def write_multimol2_chunks(multimol2, chunk_size, out_dir):
    """
    Splits a multi-mol2 file into smaller multi-mol2 files.

    Parameters
    -----------
    multimol2 : str
      Path to the multi-mol2 file.

    chunk_size : int
      Number of mol2 structures per output file. The last file may hold
      fewer.

    out_dir : str:
      Output directory (created if it does not exist). New files will be
      named <multimol2>_1.mol2, ... <multimol2>_n.mol2

    Returns
    -----------
    chunks : int
      Number of files written (0 if the input contains no molecules).

    Raises
    -----------
    ValueError
      If chunk_size is smaller than 1.

    """
    if chunk_size < 1:
        raise ValueError('chunk_size must be a positive integer')

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    out_file_stem = os.path.basename(multimol2).split('.mol2')[0]

    chunks = 0
    cnt = 0  # molecules written to the currently open chunk file
    out_file = None
    try:
        for mol2 in split_multimol2(multimol2):
            # Open the next chunk lazily, only when there is a molecule to
            # put into it, so no empty files are produced.
            if out_file is None:
                chunks += 1
                out_file = open(
                    os.path.join(out_dir, out_file_stem) + '_%d.mol2' % chunks,
                    'w')
            out_file.write(mol2[1] + '\n')
            cnt += 1
            # Roll over only after the molecule has been written, so every
            # chunk (including the first) holds exactly chunk_size entries.
            if cnt == chunk_size:
                out_file.close()
                out_file = None
                cnt = 0
    finally:
        # Make sure the open chunk is closed even if reading/writing fails.
        if out_file is not None:
            out_file.close()
    return chunks


if __name__ == '__main__':

    import argparse

    # Command-line entry point: two positional arguments (input library and
    # output directory) plus an optional chunk size.
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawTextHelpFormatter,
        description='Splits a multi-mol2 file into individual mol2 files'
        )

    parser.add_argument('MOL2_FILE')
    parser.add_argument('OUT_DIR')
    parser.add_argument(
        '-c', '--chunksize',
        type=int,
        help='Number of MOL2 structures per file (1 by default)'
        )
    parser.add_argument(
        '-v', '--version',
        action='version',
        version='split_multimol2 v. 1.1'
        )

    args = parser.parse_args()

    # Without a (non-zero) chunk size every molecule goes to its own file;
    # otherwise molecules are grouped into multi-mol2 chunk files.
    if not args.chunksize:
        write_multimol2(multimol2=args.MOL2_FILE, out_dir=args.OUT_DIR)
    else:
        write_multimol2_chunks(
            multimol2=args.MOL2_FILE,
            chunk_size=args.chunksize,
            out_dir=args.OUT_DIR
            )

猜你喜欢

转载自blog.csdn.net/u012325865/article/details/81008489