Note: This article is reproduced from stackoverflow.com; please see the original question there.
google-apps-script google-drive-api pdf

Google Apps Script: Conversion from the format application/octet-stream to application/pdf is not supported

发布于 2020-04-22 11:03:31

I have an error similar to the one in "ERROR Get pdf-attachments from Gmail as text", but the solution given there doesn't work in my case. The Drive API is enabled. The document has the extension .pdf. This script works for other documents (also PDFs). The code that reads the PDF as text is taken from "Get pdf-attachments from Gmail as text".

Please help.

This is my code:

/**
 * Searches the inbox for threads from the last six days and passes the first
 * message of every thread whose subject matches to Messages().
 *
 * Threads are visited from the last search result to the first. Only the
 * subject of each thread's first message is read here; sender, recipient,
 * date, body and attachments are left for Messages() to fetch if it needs
 * them, which avoids several unused Gmail reads per thread.
 */
function searchEmails() {

  var threads = GmailApp.search('in:inbox newer_than:6d');

  // An empty result simply skips the loop, so no separate length guard is needed.
  for (var t = threads.length - 1; t >= 0; t--) {
    // Only the first message of each thread is inspected.
    var message = threads[t].getMessages()[0];

    if (message.getSubject() === 'subject') {
      Messages(message);
    }
  }
}

/**
 * Converts the first attachment of a message to text via pdfToText() and
 * picks individual whitespace-separated tokens out of that text.
 *
 * The remainder of this function is elided in the post.
 *
 * @param {GmailMessage} message - Message whose first attachment is parsed.
 */
function Messages(message) {

  var attachments = message.getAttachments();
  // Only the first attachment is used.
  // NOTE(review): attachments[0] is undefined when the message has no
  // attachments, so the getAs() call below would throw a TypeError.
  // NOTE(review): getAs(MimeType.PDF) requests a conversion to PDF; presumably
  // this is where the "conversion from application/octet-stream to
  // application/pdf is not supported" error is raised when the first
  // attachment is not reported as a PDF - confirm.
  var blob = attachments[0].getAs(MimeType.PDF);
  // `body` is not used in the visible part of this function.
  var body = message.getBody();
  var filetext = pdfToText(blob);     
  // Drop everything before the "Title:" marker. search() returns -1 when the
  // marker is absent, in which case substr(-1) keeps only the last character.
  filetext = filetext.substr(filetext.search("Title:"));
  // Tokenise on single spaces; the indices below are positional and depend on
  // the exact layout of the extracted text.
  filetext = filetext.split(' ');

  var msgValue = filetext[12];
  var msgDate = filetext[6];
  var msgID = message.getId(); 


// rest of my code

/**
 * Extracts the text of a PDF by uploading it to Drive with OCR enabled,
 * reading the body text of the resulting Google Doc, and then (unless asked
 * otherwise) deleting that Doc again.
 *
 * Uses the Advanced Drive Service (`Drive.Files`) and `DocumentApp`.
 *
 * @param {Blob} blob - PDF data; its name and content type are used for the upload.
 * @param {Object} [options]
 * @param {string} [options.path] - Drive folder path (resolved by
 *     getDriveFolderFromPath) in which the converted Doc is created.
 * @param {string} [options.ocrLanguage] - OCR language hint; defaults to 'pl'.
 * @param {boolean} [options.keepGdoc] - When truthy, the converted Doc is kept.
 * @returns {string} Body text of the converted document.
 * @throws {Error} If options.path does not resolve to an existing folder.
 */
function pdfToText (blob, options) {

  options = options || {};
  var parents = [];
  if (options.path) {
    var folder = getDriveFolderFromPath(options.path);
    if (!folder) {
      // Fail early with a clear message instead of sending a null parent to Drive.
      throw new Error('pdfToText: Drive folder not found: ' + options.path);
    }
    // The Drive API takes parent references of the form {id: ...},
    // not a DriveApp Folder object.
    parents.push({ id: folder.getId() });
  }

  // A blob may be unnamed; fall back so the title rewrite below cannot fail on null.
  var pdfName = blob.getName() || 'untitled.pdf';
  var resource = {
    // Save PDF as GDOC
    title: pdfName.replace(/pdf$/, 'gdoc'),
    mimeType: blob.getContentType(),
    parents: parents
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: options.ocrLanguage || 'pl'
  };
  var gdocFile = Drive.Files.insert(resource, blob, insertOpts);

  try {
    // Get text from GDOC
    return DocumentApp.openById(gdocFile.id).getBody().getText();
  } finally {
    // Delete document, even when reading it failed, so no temporary Doc is left behind.
    if (!options.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }
}

/**
 * Resolves a slash-separated folder path, starting at the Drive root folder.
 *
 * Empty segments (leading, trailing or doubled slashes) are ignored, so an
 * empty or missing path yields the root folder itself. When several folders
 * share a name, the first one returned by getFoldersByName() is followed.
 *
 * @param {string} [path] - Path such as "/reports/2019".
 * @returns {Folder|null} The folder at the path, or null if any segment is missing.
 */
function getDriveFolderFromPath (path) {
  var segments = (path || "/").split("/");
  var folder = DriveApp.getRootFolder();

  for (var idx = 0; idx < segments.length; idx++) {
    var segment = segments[idx];

    if (folder && segment) {
      // Descend one level; a missing child turns the result into null.
      var matches = folder.getFoldersByName(segment);
      if (matches.hasNext()) {
        folder = matches.next();
      } else {
        folder = null;
      }
    } else if (segment) {
      // A named segment below a folder that was not found stays unresolved.
      folder = null;
    }
    // Empty segments leave the current folder untouched.
  }

  return folder;
}
Questioner
vipmaciej
Viewed
135
vipmaciej 2019-10-11 14:29

First of all, my original code works. However, if there is more than one attachment in the email, you will run into the problem I originally had. The solution is below:

/**
 * Walks the attachments of every message in recent inbox threads and runs
 * each PDF attachment through pdfToText(), then picks positional tokens out
 * of the extracted text.
 *
 * The remainder of this function (including the closing braces of the loops
 * opened below) is elided in the post.
 */
function searchEmails() {

  var threads = GmailApp.search('in:inbox newer_than:6d');

    if (threads.length > 0) {
      for (var t=threads.length-1; t>=0; t--) {
        // NOTE(review): this fetches the messages of ALL threads on every pass
        // of the `t` loop, and `t` is not used in the visible code below, so
        // each attachment appears to be processed threads.length times.
        // Hoisting this call out of the loop looks intended - confirm.
        var msgs = GmailApp.getMessagesForThreads(threads);    

        // msgs[i] is the message list of one thread; msgs[i][j] is one message.
        for (var i = 0 ; i < msgs.length; i++) {    
          for (var j = 0; j < msgs[i].length; j++) {      
            var attachments = msgs[i][j].getAttachments();                

            for (var k = 0; k < attachments.length; k++) {          
              var content = attachments[k].getContentType();                
              Logger.log(attachments[k].getName()) //check file extension

            // Only attachments whose content type is exactly application/pdf are parsed.
            if (content == 'application/pdf'){
               

/* 
you can check if the attachment has the expected name
              var attachmentsName = attachments[k].getName();
              if (attachmentsName == 'looking name'){
*/

              var blob = attachments[k].getAs(MimeType.PDF);            
              var filetext = pdfToText(blob);            
              // Drop everything before the marker. search() returns -1 when it
              // is absent, in which case substr(-1) keeps only the last character.
              filetext = filetext.substr(filetext.search("SZCZEGÓŁY"));            
              // Tokenise on single spaces; the indices below are positional and
              // depend on the exact layout of the extracted text.
              filetext = filetext.split(' ');                        
              var msgValue = filetext[14] + filetext[15];            
              var msgDate = filetext[6];            
              var type = filetext[3];            

   // rest of my code

/**
 * Extracts the text of a PDF by uploading it to Drive with OCR enabled,
 * reading the body text of the resulting Google Doc, and then (unless asked
 * otherwise) deleting that Doc again.
 *
 * Uses the Advanced Drive Service (`Drive.Files`) and `DocumentApp`.
 *
 * @param {Blob} blob - PDF data; its name and content type are used for the upload.
 * @param {Object} [options]
 * @param {string} [options.path] - Drive folder path (resolved by
 *     getDriveFolderFromPath) in which the converted Doc is created.
 * @param {string} [options.ocrLanguage] - OCR language hint; defaults to 'pl'.
 * @param {boolean} [options.keepGdoc] - When truthy, the converted Doc is kept.
 * @returns {string} Body text of the converted document.
 * @throws {Error} If options.path does not resolve to an existing folder.
 */
function pdfToText (blob, options) {

  options = options || {};
  var parents = [];
  if (options.path) {
    var folder = getDriveFolderFromPath(options.path);
    if (!folder) {
      // Fail early with a clear message instead of sending a null parent to Drive.
      throw new Error('pdfToText: Drive folder not found: ' + options.path);
    }
    // The Drive API takes parent references of the form {id: ...},
    // not a DriveApp Folder object.
    parents.push({ id: folder.getId() });
  }

  // A blob may be unnamed; fall back so the title rewrite below cannot fail on null.
  var pdfName = blob.getName() || 'untitled.pdf';
  var resource = {
    // Save PDF as GDOC
    title: pdfName.replace(/pdf$/, 'gdoc'),
    mimeType: blob.getContentType(),
    parents: parents
  };
  var insertOpts = {
    ocr: true,
    ocrLanguage: options.ocrLanguage || 'pl'
  };
  var gdocFile = Drive.Files.insert(resource, blob, insertOpts);

  try {
    // Get text from GDOC
    return DocumentApp.openById(gdocFile.id).getBody().getText();
  } finally {
    // Delete document, even when reading it failed, so no temporary Doc is left behind.
    if (!options.keepGdoc) {
      Drive.Files.remove(gdocFile.id);
    }
  }
}

/**
 * Resolves a slash-separated folder path, starting at the Drive root folder.
 *
 * Empty segments (leading, trailing or doubled slashes) are ignored, so an
 * empty or missing path yields the root folder itself. When several folders
 * share a name, the first one returned by getFoldersByName() is followed.
 *
 * @param {string} [path] - Path such as "/reports/2019".
 * @returns {Folder|null} The folder at the path, or null if any segment is missing.
 */
function getDriveFolderFromPath (path) {
  var segments = (path || "/").split("/");
  var folder = DriveApp.getRootFolder();

  for (var idx = 0; idx < segments.length; idx++) {
    var segment = segments[idx];

    if (folder && segment) {
      // Descend one level; a missing child turns the result into null.
      var matches = folder.getFoldersByName(segment);
      if (matches.hasNext()) {
        folder = matches.next();
      } else {
        folder = null;
      }
    } else if (segment) {
      // A named segment below a folder that was not found stays unresolved.
      folder = null;
    }
    // Empty segments leave the current folder untouched.
  }

  return folder;
}