#! /usr/bin/env python
# -*- coding: utf-8 -*-
#
# checori.py
#
# version 0.2 2007-11-30
#
# Copyright © 2007 Jan Beyer <jan@beathovn.de>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# On Debian systems, the complete text of the GNU General
# Public License can be found in `/usr/share/common-licenses/GPL'.
#
# This program searches through all files in the current directory (and recursively
# all subdirectories) for the occurrence of copyright information.
# It extracts this and tries to fit this into the proposed new debian/copyright
# format, see http://wiki.debian.org/Proposals/CopyrightFormat.
# YOU STILL HAVE TO MANUALLY EDIT THE OUTPUT OF THIS PROGRAM TO GET A VALID 
# debian/copyright FILE!!! It may, however, be helpful in gathering the
# needed information.
#
#

import os
import sys
import re

def reduce_path(p):
  """Return the path *p* relative to the starting directory.

  The starting directory is read from the module-level name ``basedir``.
  If *p* lies below that directory, the leading ``basedir + '/'`` is cut
  off and the remainder is returned; any other path is returned unchanged.

  Only the leading prefix is removed: a later repetition of the base path
  inside *p* (e.g. a vendored copy of the tree) is left intact.
  """
  prefix = basedir + '/'
  if p.startswith(prefix):
    # Slice instead of str.replace(): replace() would also delete every
    # further occurrence of the prefix inside the path.
    return p[len(prefix):]
  return p

# Leading "Copyright" marker, optionally followed by "(C)", "(c)" or "©".
_COPYRIGHT_PREFIX = re.compile(r'Copyright\s*(?:\([Cc]\)|©)?\s*')


def _format_copyright(first, second):
  """Build the '© ...' holder string from the first two copyright lines."""
  marker = _COPYRIGHT_PREFIX.match(first)
  if marker is not None:
    # Remove the marker as a real prefix. str.lstrip('Copyright (C)') would
    # strip a *set of characters* and eat the start of names like "Chris".
    first = first[marker.end():]
  # The second line often carries the e-mail address; join with one blank
  # so the two lines do not run into each other.
  parts = [part for part in (first.strip(), second.strip()) if part]
  return '© ' + ' '.join(parts)


def _guess_license(lines, limit):
  """Derive a short license tag, plus an excerpt for unknown licenses.

  *lines* holds the copyright line followed by *limit* further stripped
  lines. Lines 2 .. limit-1 are scanned for (L)GPL / public-domain wording.
  The result always starts with the tag and a newline. Unless the tag is
  GPL or LGPL, up to two paragraphs of the following text are appended,
  each line prefixed by a blank, as a hint for manual review.
  """
  lic = ''
  for text in lines[2:limit]:
    lowered = text.lower()
    if 'GNU General' in text:
      lic = 'GPL'
    if 'GNU Lesser General' in text or 'GNU Library General' in text:
      lic = 'LGPL'
    # Version suffixes only make sense once a license has been recognised;
    # otherwise the tag would end up as a bare "-2" or "+".
    if lic and 'version 2' in lowered:
      lic += '-2'
    if lic and 'any later version' in text:
      lic += '+'
    if 'public domain' in lowered:
      lic = 'PD'
  if not lic:
    lic = 'other'
  lic += '\n'

  # GPL and LGPL are covered by the standard reference to
  # '/usr/share/common-licenses', so no excerpt is needed for them.
  if lic.startswith('GPL') or lic.startswith('LGPL'):
    return lic

  # For everything else append at most two paragraphs (and never more than
  # the lines that were read) so the real license text is at hand.
  # Funny licenses still have to be checked manually in the source files.
  i = 2
  paragraphs = 0
  while i <= limit - 1 and paragraphs < 2:
    lic += ' ' + lines[i] + '\n'
    i += 1
    if not lines[i]:
      paragraphs += 1
  return lic


def check_copyright(file):
  """Check the given file for copyright information.

  The file is searched line by line for the regex "checkfor" (matched at
  the start of a line after stripping the usual comment characters). For a
  matching line, the following "maxlinesaftercopyright" lines are read as
  well and three strings are derived:

    * the filename, relative to the starting directory (see reduce_path),
    * the copyright holder, built from the matching line and the line
      after it (as one string, prefixed with '© '),
    * a license tag followed by a newline and, unless the license is GPL or
      LGPL, up to two paragraphs of license text. For GPL and LGPL you are
      supposed to have the standard pointer to
      '/usr/share/common-licenses' in your debian/copyright file.

  Lines matching the main copyright of your package (regex "maincopyright")
  are skipped, as these files need not be listed. If a file contains
  several reportable copyright lines, the values of the last one are
  returned.

  Returns the triplet (filename, copyright, license), or three empty
  strings if nothing reportable was found.
  """
  fn = ''
  cr = ''
  lic = ''
  # NOTE(review): the file is opened in text mode with the default encoding;
  # confirm how non-text files in the tree should be handled on Python 3.
  with open(file, 'r') as f:
    raw = f.readline()
    while raw:  # readline() returns '' only at end of file
      line = raw.strip('/*# ')
      if checkfor.match(line) is not None and maincopyright.match(line.strip()) is None:
        fn = reduce_path(file)
        copylic = [line.strip()]
        for _ in range(maxlinesaftercopyright):
          # stripping usual commentary signs and newlines.
          copylic.append(f.readline().strip('/*# ').strip())
        # The following you probably want to check for your package.
        cr = _format_copyright(copylic[0], copylic[1])
        lic = _guess_license(copylic, maxlinesaftercopyright)
      raw = f.readline()
  if fn != '':
    return fn, cr, lic
  return '', '', ''

def checkdir(dire):
  """Check a directory (recursively) for copyright information in its files.

  The entries of *dire* are visited in sorted order. For every regular file
  check_copyright(filename) is called; subdirectories are descended into.
  A returned triplet with a non-empty filename is merged into the three
  module-level lists files, copyrights and licenses:

    * if an entry with the same copyright and license already exists, only
      the filename is appended (comma separated) to the matching files[i];
    * otherwise the complete triplet is added as a new entry.

  Nothing is returned; the three lists are modified in place.
  """
  for item in sorted(os.listdir(dire)):
    path = os.path.join(dire, item)
    if os.path.isfile(path):
      fn, cr, lic = check_copyright(path)
      if fn != '':
        for i, (known_cr, known_lic) in enumerate(zip(copyrights, licenses)):
          if cr == known_cr and lic == known_lic:
            files[i] += ', ' + fn
            # Stop at the match: carrying on would let a later, different
            # entry cancel the hit and add the file a second time.
            break
        else:
          # No existing entry matched - start a new one.
          files.append(fn)
          copyrights.append(cr)
          licenses.append(lic)
    elif os.path.isdir(path):
      checkdir(path)
  

if __name__=="__main__":
  """Compile copyright info from a source tree.
  
  Call this script in the following way
  $ checori > copyright-skeleton
  
  This script looks for copyright information in all files in a source tree,
  starting from the current directory. It outputs filenames, their copyright
  holders and license information in a format resembling the proposal for
  machine-interpretable debian/copyright files on
  http://wiki.debian.org/Proposals/CopyrightFormat in the version from Nov 2007.
  
  IT DOES NOT PRODUCE VALID debian/copyright FILES!!!
  """
  
  print('This file is NO VALID debian/copyright FILE!!!')

  # The files are checked for "checkfor". Starting from there, all the rest
  # of the copyright and license information is looked for.
  checkfor = re.compile('Copyright')

  # "maincopyright" holds the copyright of the package as a whole. This has to
  # be added manually to your final file in an appropriate way. All
  # files containing this information will NOT be reported by this script!!!
  # This also applies to files which have a license differing from the main
  # license but the same copyright line! BEWARE!
  # (Raw string, so the backslashes reach the regex engine unchanged. Note
  # that the trailing '.' is a regex wildcard, not a literal full stop.)
  maincopyright = re.compile(r'Copyright \(C\) [0-9, -]* David Necas \(Yeti\), Petr Klapetek.')
  
  # "maxlinesaftercopyright" is the maximum number of lines the license
  # information will contain. But at most two paragraphs are added.
  maxlinesaftercopyright = 10

  basedir = os.getcwd()
  
  # Here the three lists which will hold all the wanted info are initialized.
  # They are module-level names and are modified directly by the functions.
  # Yes, I know, this is not the correct way - but it is a quick way... ;-)
  files = []
  copyrights = []
  licenses = []
  
  # Let's start the magic...
  checkdir(basedir)
  
  # Now output the result to STDOUT. print() with a single parenthesised
  # argument behaves the same as a print statement and as a function call.
  for entry_files, entry_copyright, entry_license in zip(files, copyrights, licenses):
    print('Files: '+entry_files)
    print('Copyright: '+entry_copyright)
    print('License: '+entry_license)

